1 /* Loop unrolling and peeling.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "basic-block.h"
29 #include "cfgloop.h"
30 #include "params.h"
31 #include "expr.h"
32 #include "hash-table.h"
33 #include "recog.h"
34 #include "target.h"
35 #include "dumpfile.h"
36
37 /* This pass performs loop unrolling and peeling. We only perform these
38 optimizations on innermost loops (with a single exception) because
39 the impact on performance is greatest here, and we want to avoid
40 unnecessary code size growth. The gain comes from greater sequentiality
41 of code, from better code for further optimization passes to work on,
42 and in some cases from fewer tests of exit conditions. The main problem
43 is code growth, which impacts performance negatively via cache effects.
44
45 What we do:
46
47 -- complete peeling of once-rolling loops; this is the above-mentioned
48 exception, as it cancels the loop completely and
49 does not cause code growth
50 -- complete peeling of loops that roll a (small) constant number of times
51 -- simple peeling of the first iterations of loops that do not roll much
52 (according to profile feedback)
53 -- unrolling of loops that roll a constant number of times; this is
54 almost always a win, as we get rid of exit condition tests
55 -- unrolling of loops that roll a number of times that we can compute
56 at runtime; we also get rid of exit condition tests here, but there
57 is the extra expense of calculating the number of iterations
58 -- simple unrolling of remaining loops; this is performed only if we
59 are asked to, as the gain is questionable in this case and often
60 it may even slow down the code
61 For more detailed descriptions of each of those, see the comments at
62 the appropriate functions below.
63
64 There are a lot of parameters (defined and described in params.def) that
65 control how much we unroll/peel.
66
67 ??? A great problem is that we don't have a good way to determine
68 how many times we should unroll the loop; the experiments I have made
69 showed that this choice may affect performance on the order of
70 several percent.  */
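/* For quick reference (editorial note; all of these appear in the code
   below and are defined in params.def):
   PARAM_MAX_ONCE_PEELED_INSNS, PARAM_MAX_COMPLETELY_PEELED_INSNS and
   PARAM_MAX_COMPLETELY_PEEL_TIMES bound complete peeling;
   PARAM_MAX_PEELED_INSNS and PARAM_MAX_PEEL_TIMES bound simple peeling;
   PARAM_MAX_UNROLLED_INSNS, PARAM_MAX_AVERAGE_UNROLLED_INSNS and
   PARAM_MAX_UNROLL_TIMES bound unrolling.  */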
71
72 /* Information about induction variables to split. */
73
74 struct iv_to_split
75 {
76 rtx insn; /* The insn in which the induction variable occurs. */
77 rtx orig_var; /* The variable (register) for the IV before split. */
78 rtx base_var; /* The variable on which the values in further
79 iterations are based. */
80 rtx step; /* Step of the induction variable. */
81 struct iv_to_split *next; /* Next entry in walking order. */
82 };
83
84 /* Information about accumulators to expand. */
85
86 struct var_to_expand
87 {
88 rtx insn; /* The insn in which the variable expansion occurs. */
89 rtx reg; /* The accumulator which is expanded. */
90 vec<rtx> var_expansions; /* The copies of the accumulator being expanded. */
91 struct var_to_expand *next; /* Next entry in walking order. */
92 enum rtx_code op; /* The type of the accumulation - addition, subtraction
93 or multiplication. */
94 int expansion_count; /* Count the number of expansions generated so far. */
95 int reuse_expansion; /* The expansion we intend to reuse to expand
96 the accumulator. If REUSE_EXPANSION is 0 reuse
97 the original accumulator. Else use
98 var_expansions[REUSE_EXPANSION - 1]. */
99 };
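/* Illustration (editorial sketch at the source level, not code from this
   pass): expanding the accumulator in

     for (i = 0; i < n; i++)
       sum += a[i];

   with one extra expansion corresponds to

     sum1 = 0;                        <- initialized in the new preheader
     for (i = 0; i + 1 < n; i += 2)
       {
	 sum += a[i];
	 sum1 += a[i + 1];            <- the body copy uses the expansion
       }
     ... remainder iterations elided ...
     sum += sum1;                     <- combined in the new loop exit block

   This breaks the dependence chain on SUM; reordering floating-point
   additions this way is why flag_associative_math is required (see
   analyze_insn_to_expand_var below).  */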
100
101 /* Hashtable helper for iv_to_split. */
102
103 struct iv_split_hasher : typed_free_remove <iv_to_split>
104 {
105 typedef iv_to_split value_type;
106 typedef iv_to_split compare_type;
107 static inline hashval_t hash (const value_type *);
108 static inline bool equal (const value_type *, const compare_type *);
109 };
110
111
112 /* A hash function for information about insns to split. */
113
114 inline hashval_t
115 iv_split_hasher::hash (const value_type *ivts)
116 {
117 return (hashval_t) INSN_UID (ivts->insn);
118 }
119
120 /* An equality function for information about insns to split. */
121
122 inline bool
123 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
124 {
125 return i1->insn == i2->insn;
126 }
127
128 /* Hashtable helper for var_to_expand. */
129
130 struct var_expand_hasher : typed_free_remove <var_to_expand>
131 {
132 typedef var_to_expand value_type;
133 typedef var_to_expand compare_type;
134 static inline hashval_t hash (const value_type *);
135 static inline bool equal (const value_type *, const compare_type *);
136 };
137
138 /* Return a hash for VES. */
139
140 inline hashval_t
141 var_expand_hasher::hash (const value_type *ves)
142 {
143 return (hashval_t) INSN_UID (ves->insn);
144 }
145
146 /* Return true if I1 and I2 refer to the same instruction. */
147
148 inline bool
149 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
150 {
151 return i1->insn == i2->insn;
152 }
153
154 /* Information about optimizations applied in
155 the unrolled loop. */
156
157 struct opt_info
158 {
159 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
160 split. */
161 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
162 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
163 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
164 insns with accumulators to expand. */
165 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
166 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
167 unsigned first_new_block; /* The first basic block that was
168 duplicated. */
169 basic_block loop_exit; /* The loop exit basic block. */
170 basic_block loop_preheader; /* The loop preheader basic block. */
171 };
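/* The lifecycle of this structure, as the unrolling and peeling routines
   below use it (a condensed editorial sketch of calls that appear in this
   file; parameter names in the last two calls are editorial):

     opt_info = analyze_insns_in_loop (loop);
     opt_info_start_duplication (opt_info);
     ok = duplicate_loop_to_header_edge (loop, ..., wont_exit, ...,
					 DLTHE_FLAG_UPDATE_FREQ
					 | DLTHE_RECORD_COPY_NUMBER);
     apply_opt_in_copies (opt_info, n_copies, unroll_p, rewrite_orig_p);
     free_opt_info (opt_info);
*/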
172
173 static void decide_unrolling_and_peeling (int);
174 static void peel_loops_completely (int);
175 static void decide_peel_simple (struct loop *, int);
176 static void decide_peel_once_rolling (struct loop *, int);
177 static void decide_peel_completely (struct loop *, int);
178 static void decide_unroll_stupid (struct loop *, int);
179 static void decide_unroll_constant_iterations (struct loop *, int);
180 static void decide_unroll_runtime_iterations (struct loop *, int);
181 static void peel_loop_simple (struct loop *);
182 static void peel_loop_completely (struct loop *);
183 static void unroll_loop_stupid (struct loop *);
184 static void unroll_loop_constant_iterations (struct loop *);
185 static void unroll_loop_runtime_iterations (struct loop *);
186 static struct opt_info *analyze_insns_in_loop (struct loop *);
187 static void opt_info_start_duplication (struct opt_info *);
188 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
189 static void free_opt_info (struct opt_info *);
190 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
191 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
192 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
193 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
194 static void insert_var_expansion_initialization (struct var_to_expand *,
195 basic_block);
196 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
197 basic_block);
198 static rtx get_expansion (struct var_to_expand *);
199
200 /* Emit a message summarizing the unroll or peel that will be
201 performed for LOOP, along with the loop's location LOCUS, if
202 appropriate given the dump or -fopt-info settings. */
203
204 static void
205 report_unroll_peel (struct loop *loop, location_t locus)
206 {
207 struct niter_desc *desc;
208 int niters = 0;
209 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
210
211 if (loop->lpt_decision.decision == LPT_NONE)
212 return;
213
214 if (!dump_enabled_p ())
215 return;
216
217 /* In the special case where the loop never iterated, emit
218 a different message so that we don't report an unroll by 0.
219 This matches the equivalent message emitted during tree unrolling. */
220 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
221 && !loop->lpt_decision.times)
222 {
223 dump_printf_loc (report_flags, locus,
224 "loop turned into non-loop; it never loops.\n");
225 return;
226 }
227
228 desc = get_simple_loop_desc (loop);
229
230 if (desc->const_iter)
231 niters = desc->niter;
232 else if (loop->header->count)
233 niters = expected_loop_iterations (loop);
234
235 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
236 dump_printf_loc (report_flags, locus,
237 "loop with %d iterations completely unrolled",
238 loop->lpt_decision.times + 1);
239 else
240 dump_printf_loc (report_flags, locus,
241 "loop %s %d times",
242 (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
243 ? "peeled" : "unrolled"),
244 loop->lpt_decision.times);
245 if (profile_info)
246 dump_printf (report_flags,
247 " (header execution count %d",
248 (int)loop->header->count);
249 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
250 dump_printf (report_flags,
251 "%s%s iterations %d)",
252 profile_info ? ", " : " (",
253 desc->const_iter ? "const" : "average",
254 niters);
255 else if (profile_info)
256 dump_printf (report_flags, ")");
257
258 dump_printf (report_flags, "\n");
259 }
260
261 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
262 void
263 unroll_and_peel_loops (int flags)
264 {
265 struct loop *loop;
266 bool changed = false;
267
268 /* First perform complete loop peeling (it is almost surely a win,
269 and it affects the parameters for further decisions a lot). */
270 peel_loops_completely (flags);
271
272 /* Now decide the rest of the unrolling and peeling. */
273 decide_unrolling_and_peeling (flags);
274
275 /* Scan the loops, inner ones first. */
276 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
277 {
278 /* And perform the appropriate transformations. */
279 switch (loop->lpt_decision.decision)
280 {
281 case LPT_PEEL_COMPLETELY:
282 /* Already done. */
283 gcc_unreachable ();
284 case LPT_PEEL_SIMPLE:
285 peel_loop_simple (loop);
286 changed = true;
287 break;
288 case LPT_UNROLL_CONSTANT:
289 unroll_loop_constant_iterations (loop);
290 changed = true;
291 break;
292 case LPT_UNROLL_RUNTIME:
293 unroll_loop_runtime_iterations (loop);
294 changed = true;
295 break;
296 case LPT_UNROLL_STUPID:
297 unroll_loop_stupid (loop);
298 changed = true;
299 break;
300 case LPT_NONE:
301 break;
302 default:
303 gcc_unreachable ();
304 }
305 }
306
307 if (changed)
308 {
309 calculate_dominance_info (CDI_DOMINATORS);
310 fix_loop_structure (NULL);
311 }
312
313 iv_analysis_done ();
314 }
315
316 /* Check whether the exit of the LOOP is at the end of the loop body. */
317
318 static bool
319 loop_exit_at_end_p (struct loop *loop)
320 {
321 struct niter_desc *desc = get_simple_loop_desc (loop);
322 rtx insn;
323
324 if (desc->in_edge->dest != loop->latch)
325 return false;
326
327 /* Check that the latch is empty. */
328 FOR_BB_INSNS (loop->latch, insn)
329 {
330 if (NONDEBUG_INSN_P (insn))
331 return false;
332 }
333
334 return true;
335 }
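/* Editorial illustration: loops of do-while shape,

     do { body; } while (cond);

   test the exit condition at the end of the body and leave the latch
   empty, so this predicate is true for them; while-shaped loops,

     while (cond) { body; }

   test it at the beginning, so it is false.  The distinction decides
   where the exit test is kept when unrolling (see the exit_at_end
   handling in unroll_loop_constant_iterations below).  */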
336
337 /* Depending on FLAGS, check whether to peel loops completely and do so. */
338 static void
339 peel_loops_completely (int flags)
340 {
341 struct loop *loop;
342 bool changed = false;
343
344 /* Scan the loops, the inner ones first. */
345 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
346 {
347 loop->lpt_decision.decision = LPT_NONE;
348 location_t locus = get_loop_location (loop);
349
350 if (dump_enabled_p ())
351 dump_printf_loc (TDF_RTL, locus,
352 ";; *** Considering loop %d at BB %d for "
353 "complete peeling ***\n",
354 loop->num, loop->header->index);
355
356 loop->ninsns = num_loop_insns (loop);
357
358 decide_peel_once_rolling (loop, flags);
359 if (loop->lpt_decision.decision == LPT_NONE)
360 decide_peel_completely (loop, flags);
361
362 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
363 {
364 report_unroll_peel (loop, locus);
365 peel_loop_completely (loop);
366 changed = true;
367 }
368 }
369
370 if (changed)
371 {
372 calculate_dominance_info (CDI_DOMINATORS);
373 fix_loop_structure (NULL);
374 }
375 }
376
377 /* Decide whether to unroll or peel loops (depending on FLAGS) and how much. */
378 static void
379 decide_unrolling_and_peeling (int flags)
380 {
381 struct loop *loop;
382
383 /* Scan the loops, inner ones first. */
384 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
385 {
386 loop->lpt_decision.decision = LPT_NONE;
387 location_t locus = get_loop_location (loop);
388
389 if (dump_enabled_p ())
390 dump_printf_loc (TDF_RTL, locus,
391 ";; *** Considering loop %d at BB %d for "
392 "unrolling and peeling ***\n",
393 loop->num, loop->header->index);
394
395 /* Do not peel cold areas. */
396 if (optimize_loop_for_size_p (loop))
397 {
398 if (dump_file)
399 fprintf (dump_file, ";; Not considering loop, cold area\n");
400 continue;
401 }
402
403 /* Can the loop be manipulated? */
404 if (!can_duplicate_loop_p (loop))
405 {
406 if (dump_file)
407 fprintf (dump_file,
408 ";; Not considering loop, cannot duplicate\n");
409 continue;
410 }
411
412 /* Skip non-innermost loops. */
413 if (loop->inner)
414 {
415 if (dump_file)
416 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
417 continue;
418 }
419
420 loop->ninsns = num_loop_insns (loop);
421 loop->av_ninsns = average_num_loop_insns (loop);
422
423 /* Try transformations one by one in decreasing order of
424 priority. */
425
426 decide_unroll_constant_iterations (loop, flags);
427 if (loop->lpt_decision.decision == LPT_NONE)
428 decide_unroll_runtime_iterations (loop, flags);
429 if (loop->lpt_decision.decision == LPT_NONE)
430 decide_unroll_stupid (loop, flags);
431 if (loop->lpt_decision.decision == LPT_NONE)
432 decide_peel_simple (loop, flags);
433
434 report_unroll_peel (loop, locus);
435 }
436 }
437
438 /* Decide whether the LOOP is once rolling and suitable for complete
439 peeling. */
440 static void
441 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
442 {
443 struct niter_desc *desc;
444
445 if (dump_file)
446 fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
447
448 /* Is the loop small enough? */
449 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
450 {
451 if (dump_file)
452 fprintf (dump_file, ";; Not considering loop, is too big\n");
453 return;
454 }
455
456 /* Check for simple loops. */
457 desc = get_simple_loop_desc (loop);
458
459 /* Check number of iterations. */
460 if (!desc->simple_p
461 || desc->assumptions
462 || desc->infinite
463 || !desc->const_iter
464 || (desc->niter != 0
465 && get_max_loop_iterations_int (loop) != 0))
466 {
467 if (dump_file)
468 fprintf (dump_file,
469 ";; Unable to prove that the loop rolls exactly once\n");
470 return;
471 }
472
473 /* Success. */
474 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
475 }
476
477 /* Decide whether the LOOP is suitable for complete peeling. */
478 static void
479 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
480 {
481 unsigned npeel;
482 struct niter_desc *desc;
483
484 if (dump_file)
485 fprintf (dump_file, "\n;; Considering peeling completely\n");
486
487 /* Skip non-innermost loops. */
488 if (loop->inner)
489 {
490 if (dump_file)
491 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
492 return;
493 }
494
495 /* Do not peel cold areas. */
496 if (optimize_loop_for_size_p (loop))
497 {
498 if (dump_file)
499 fprintf (dump_file, ";; Not considering loop, cold area\n");
500 return;
501 }
502
503 /* Can the loop be manipulated? */
504 if (!can_duplicate_loop_p (loop))
505 {
506 if (dump_file)
507 fprintf (dump_file,
508 ";; Not considering loop, cannot duplicate\n");
509 return;
510 }
511
512 /* npeel = number of iterations to peel. */
513 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
514 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
515 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
516
517 /* Is the loop small enough? */
518 if (!npeel)
519 {
520 if (dump_file)
521 fprintf (dump_file, ";; Not considering loop, is too big\n");
522 return;
523 }
524
525 /* Check for simple loops. */
526 desc = get_simple_loop_desc (loop);
527
528 /* Check number of iterations. */
529 if (!desc->simple_p
530 || desc->assumptions
531 || !desc->const_iter
532 || desc->infinite)
533 {
534 if (dump_file)
535 fprintf (dump_file,
536 ";; Unable to prove that the loop iterates constant times\n");
537 return;
538 }
539
540 if (desc->niter > npeel - 1)
541 {
542 if (dump_file)
543 {
544 fprintf (dump_file,
545 ";; Not peeling loop completely, rolls too much (");
546 fprintf (dump_file, "%"PRId64, desc->niter);
547 fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
548 }
549 return;
550 }
551
552 /* Success. */
553 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
554 }
555
556 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
557 completely. The transformation done:
558
559 for (i = 0; i < 4; i++)
560 body;
561
562 ==>
563
564 i = 0;
565 body; i++;
566 body; i++;
567 body; i++;
568 body; i++;
569 */
570 static void
571 peel_loop_completely (struct loop *loop)
572 {
573 sbitmap wont_exit;
574 unsigned HOST_WIDE_INT npeel;
575 unsigned i;
576 edge ein;
577 struct niter_desc *desc = get_simple_loop_desc (loop);
578 struct opt_info *opt_info = NULL;
579
580 npeel = desc->niter;
581
582 if (npeel)
583 {
584 bool ok;
585
586 wont_exit = sbitmap_alloc (npeel + 1);
587 bitmap_ones (wont_exit);
588 bitmap_clear_bit (wont_exit, 0);
589 if (desc->noloop_assumptions)
590 bitmap_clear_bit (wont_exit, 1);
591
592 auto_vec<edge> remove_edges;
593 if (flag_split_ivs_in_unroller)
594 opt_info = analyze_insns_in_loop (loop);
595
596 opt_info_start_duplication (opt_info);
597 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
598 npeel,
599 wont_exit, desc->out_edge,
600 &remove_edges,
601 DLTHE_FLAG_UPDATE_FREQ
602 | DLTHE_FLAG_COMPLETTE_PEEL
603 | (opt_info
604 ? DLTHE_RECORD_COPY_NUMBER : 0));
605 gcc_assert (ok);
606
607 free (wont_exit);
608
609 if (opt_info)
610 {
611 apply_opt_in_copies (opt_info, npeel, false, true);
612 free_opt_info (opt_info);
613 }
614
615 /* Remove the exit edges. */
616 FOR_EACH_VEC_ELT (remove_edges, i, ein)
617 remove_path (ein);
618 }
619
620 ein = desc->in_edge;
621 free_simple_loop_desc (loop);
622
623 /* Now remove the unreachable part of the last iteration and cancel
624 the loop. */
625 remove_path (ein);
626
627 if (dump_file)
628 fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
629 }
630
631 /* Decide whether to unroll LOOP iterating a constant number of times
632 and how much. */
633
634 static void
635 decide_unroll_constant_iterations (struct loop *loop, int flags)
636 {
637 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
638 struct niter_desc *desc;
639 widest_int iterations;
640
641 if (!(flags & UAP_UNROLL))
642 {
643 /* We were not asked to, just return silently. */
644 return;
645 }
646
647 if (dump_file)
648 fprintf (dump_file,
649 "\n;; Considering unrolling loop with constant "
650 "number of iterations\n");
651
652 /* nunroll = total number of copies of the original loop body in
653 unrolled loop (i.e. if it is 2, we have to duplicate loop body once). */
654 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
655 nunroll_by_av
656 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
657 if (nunroll > nunroll_by_av)
658 nunroll = nunroll_by_av;
659 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
660 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
661
662 if (targetm.loop_unroll_adjust)
663 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
664
665 /* Skip big loops. */
666 if (nunroll <= 1)
667 {
668 if (dump_file)
669 fprintf (dump_file, ";; Not considering loop, is too big\n");
670 return;
671 }
672
673 /* Check for simple loops. */
674 desc = get_simple_loop_desc (loop);
675
676 /* Check number of iterations. */
677 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
678 {
679 if (dump_file)
680 fprintf (dump_file,
681 ";; Unable to prove that the loop iterates constant times\n");
682 return;
683 }
684
685 /* Check whether the loop rolls enough to consider.
686 Consult also loop bounds and profile; in case the loop has more
687 than one exit it may well loop fewer times than the determined maximal
688 number of iterations. */
689 if (desc->niter < 2 * nunroll
690 || ((get_estimated_loop_iterations (loop, &iterations)
691 || get_max_loop_iterations (loop, &iterations))
692 && wi::ltu_p (iterations, 2 * nunroll)))
693 {
694 if (dump_file)
695 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
696 return;
697 }
698
699 /* Success; now compute the number of iterations to unroll. We alter
700 nunroll so that as few copies of the loop body as possible are
701 necessary, while still not decreasing the number of unrollings
702 too much (at most by 1). */
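/* Worked example (editorial): with desc->niter == 10, nunroll == 4 and the
   exit not at the end of the loop, the candidates i = 3 ... 10 give
   n_copies = 10 % (i + 1) + i + 1; the minimum is 5, reached at i == 4,
   so we unroll 4 times: five copies of the body, no peeled remainder,
   and the new loop iterates exactly twice.  */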
703 best_copies = 2 * nunroll + 10;
704
705 i = 2 * nunroll + 2;
706 if (i - 1 >= desc->niter)
707 i = desc->niter - 2;
708
709 for (; i >= nunroll - 1; i--)
710 {
711 unsigned exit_mod = desc->niter % (i + 1);
712
713 if (!loop_exit_at_end_p (loop))
714 n_copies = exit_mod + i + 1;
715 else if (exit_mod != (unsigned) i
716 || desc->noloop_assumptions != NULL_RTX)
717 n_copies = exit_mod + i + 2;
718 else
719 n_copies = i + 1;
720
721 if (n_copies < best_copies)
722 {
723 best_copies = n_copies;
724 best_unroll = i;
725 }
726 }
727
728 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
729 loop->lpt_decision.times = best_unroll;
730 }
731
732 /* Unroll LOOP, which iterates a constant number of times, LOOP->LPT_DECISION.TIMES
733 times. The transformation does this:
734
735 for (i = 0; i < 102; i++)
736 body;
737
738 ==> (LOOP->LPT_DECISION.TIMES == 3)
739
740 i = 0;
741 body; i++;
742 body; i++;
743 while (i < 102)
744 {
745 body; i++;
746 body; i++;
747 body; i++;
748 body; i++;
749 }
750 */
751 static void
752 unroll_loop_constant_iterations (struct loop *loop)
753 {
754 unsigned HOST_WIDE_INT niter;
755 unsigned exit_mod;
756 sbitmap wont_exit;
757 unsigned i;
758 edge e;
759 unsigned max_unroll = loop->lpt_decision.times;
760 struct niter_desc *desc = get_simple_loop_desc (loop);
761 bool exit_at_end = loop_exit_at_end_p (loop);
762 struct opt_info *opt_info = NULL;
763 bool ok;
764
765 niter = desc->niter;
766
767 /* Should not get here (such a loop should be peeled instead). */
768 gcc_assert (niter > max_unroll + 1);
769
770 exit_mod = niter % (max_unroll + 1);
771
772 wont_exit = sbitmap_alloc (max_unroll + 1);
773 bitmap_ones (wont_exit);
774
775 auto_vec<edge> remove_edges;
776 if (flag_split_ivs_in_unroller
777 || flag_variable_expansion_in_unroller)
778 opt_info = analyze_insns_in_loop (loop);
779
780 if (!exit_at_end)
781 {
782 /* The exit is not at the end of the loop; leave the exit test
783 in the first copy, so that loops that start with a test
784 of the exit condition have a contiguous body after unrolling. */
785
786 if (dump_file)
787 fprintf (dump_file, ";; Condition at beginning of loop.\n");
788
789 /* Peel exit_mod iterations. */
790 bitmap_clear_bit (wont_exit, 0);
791 if (desc->noloop_assumptions)
792 bitmap_clear_bit (wont_exit, 1);
793
794 if (exit_mod)
795 {
796 opt_info_start_duplication (opt_info);
797 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
798 exit_mod,
799 wont_exit, desc->out_edge,
800 &remove_edges,
801 DLTHE_FLAG_UPDATE_FREQ
802 | (opt_info && exit_mod > 1
803 ? DLTHE_RECORD_COPY_NUMBER
804 : 0));
805 gcc_assert (ok);
806
807 if (opt_info && exit_mod > 1)
808 apply_opt_in_copies (opt_info, exit_mod, false, false);
809
810 desc->noloop_assumptions = NULL_RTX;
811 desc->niter -= exit_mod;
812 loop->nb_iterations_upper_bound -= exit_mod;
813 if (loop->any_estimate
814 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
815 loop->nb_iterations_estimate -= exit_mod;
816 else
817 loop->any_estimate = false;
818 }
819
820 bitmap_set_bit (wont_exit, 1);
821 }
822 else
823 {
824 /* Leave the exit test in the last copy, for the same reason as above, if
825 the loop tests the condition at the end of the loop body. */
826
827 if (dump_file)
828 fprintf (dump_file, ";; Condition at end of loop.\n");
829
830 /* We know that niter >= max_unroll + 2; so we do not need to care about
831 the case where we would exit before reaching the loop. So just peel
832 exit_mod + 1 iterations. */
833 if (exit_mod != max_unroll
834 || desc->noloop_assumptions)
835 {
836 bitmap_clear_bit (wont_exit, 0);
837 if (desc->noloop_assumptions)
838 bitmap_clear_bit (wont_exit, 1);
839
840 opt_info_start_duplication (opt_info);
841 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
842 exit_mod + 1,
843 wont_exit, desc->out_edge,
844 &remove_edges,
845 DLTHE_FLAG_UPDATE_FREQ
846 | (opt_info && exit_mod > 0
847 ? DLTHE_RECORD_COPY_NUMBER
848 : 0));
849 gcc_assert (ok);
850
851 if (opt_info && exit_mod > 0)
852 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
853
854 desc->niter -= exit_mod + 1;
855 loop->nb_iterations_upper_bound -= exit_mod + 1;
856 if (loop->any_estimate
857 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
858 loop->nb_iterations_estimate -= exit_mod + 1;
859 else
860 loop->any_estimate = false;
861 desc->noloop_assumptions = NULL_RTX;
862
863 bitmap_set_bit (wont_exit, 0);
864 bitmap_set_bit (wont_exit, 1);
865 }
866
867 bitmap_clear_bit (wont_exit, max_unroll);
868 }
869
870 /* Now unroll the loop. */
871
872 opt_info_start_duplication (opt_info);
873 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
874 max_unroll,
875 wont_exit, desc->out_edge,
876 &remove_edges,
877 DLTHE_FLAG_UPDATE_FREQ
878 | (opt_info
879 ? DLTHE_RECORD_COPY_NUMBER
880 : 0));
881 gcc_assert (ok);
882
883 if (opt_info)
884 {
885 apply_opt_in_copies (opt_info, max_unroll, true, true);
886 free_opt_info (opt_info);
887 }
888
889 free (wont_exit);
890
891 if (exit_at_end)
892 {
893 basic_block exit_block = get_bb_copy (desc->in_edge->src);
894 /* Find a new in and out edge; they are in the last copy we have made. */
895
896 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
897 {
898 desc->out_edge = EDGE_SUCC (exit_block, 0);
899 desc->in_edge = EDGE_SUCC (exit_block, 1);
900 }
901 else
902 {
903 desc->out_edge = EDGE_SUCC (exit_block, 1);
904 desc->in_edge = EDGE_SUCC (exit_block, 0);
905 }
906 }
907
908 desc->niter /= max_unroll + 1;
909 loop->nb_iterations_upper_bound
910 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
911 if (loop->any_estimate)
912 loop->nb_iterations_estimate
913 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
914 desc->niter_expr = GEN_INT (desc->niter);
915
916 /* Remove the edges. */
917 FOR_EACH_VEC_ELT (remove_edges, i, e)
918 remove_path (e);
919
920 if (dump_file)
921 fprintf (dump_file,
922 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
923 max_unroll, num_loop_insns (loop));
924 }
925
926 /* Decide whether to unroll LOOP iterating a runtime-computable number of
927 times, and how much. */
928 static void
929 decide_unroll_runtime_iterations (struct loop *loop, int flags)
930 {
931 unsigned nunroll, nunroll_by_av, i;
932 struct niter_desc *desc;
933 widest_int iterations;
934
935 if (!(flags & UAP_UNROLL))
936 {
937 /* We were not asked to, just return silently. */
938 return;
939 }
940
941 if (dump_file)
942 fprintf (dump_file,
943 "\n;; Considering unrolling loop with runtime "
944 "computable number of iterations\n");
945
946 /* nunroll = total number of copies of the original loop body in
947 unrolled loop (i.e. if it is 2, we have to duplicate loop body once). */
948 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
949 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
950 if (nunroll > nunroll_by_av)
951 nunroll = nunroll_by_av;
952 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
953 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
954
955 if (targetm.loop_unroll_adjust)
956 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
957
958 /* Skip big loops. */
959 if (nunroll <= 1)
960 {
961 if (dump_file)
962 fprintf (dump_file, ";; Not considering loop, is too big\n");
963 return;
964 }
965
966 /* Check for simple loops. */
967 desc = get_simple_loop_desc (loop);
968
969 /* Check simpleness. */
970 if (!desc->simple_p || desc->assumptions)
971 {
972 if (dump_file)
973 fprintf (dump_file,
974 ";; Unable to prove that the number of iterations "
975 "can be counted in runtime\n");
976 return;
977 }
978
979 if (desc->const_iter)
980 {
981 if (dump_file)
982 fprintf (dump_file, ";; Loop iterates constant times\n");
983 return;
984 }
985
986 /* Check whether the loop rolls. */
987 if ((get_estimated_loop_iterations (loop, &iterations)
988 || get_max_loop_iterations (loop, &iterations))
989 && wi::ltu_p (iterations, 2 * nunroll))
990 {
991 if (dump_file)
992 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
993 return;
994 }
995
996 /* Success; now force nunroll to be a power of 2, as we are unable to
997 cope with overflows in the computation of the number of iterations. */
998 for (i = 1; 2 * i <= nunroll; i *= 2)
999 continue;
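/* E.g. (editorial note): nunroll == 7 leaves i == 4 after the loop above,
   so lpt_decision.times becomes 3 and the body will appear four times in
   the unrolled loop; keeping the copy count a power of two is what lets
   unroll_loop_runtime_iterations compute the remainder with a plain AND.  */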
1000
1001 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1002 loop->lpt_decision.times = i - 1;
1003 }
1004
1005 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1006 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
1007 and NULL is returned instead. */
1008
1009 basic_block
1010 split_edge_and_insert (edge e, rtx insns)
1011 {
1012 basic_block bb;
1013
1014 if (!insns)
1015 return NULL;
1016 bb = split_edge (e);
1017 emit_insn_after (insns, BB_END (bb));
1018
1019 /* ??? We used to assume that INSNS can contain control flow insns, and
1020 that we had to try to find sub basic blocks in BB to maintain a valid
1021 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
1022 and call break_superblocks when going out of cfglayout mode. But it
1023 turns out that this never happens; and that if it does ever happen,
1024 the verify_flow_info at the end of the RTL loop passes would fail.
1025
1026 There are two reasons why we expected we could have control flow insns
1027 in INSNS. The first is when a comparison has to be done in parts, and
1028 the second is when the number of iterations is computed for loops with
1029 the number of iterations known at runtime. In both cases, test cases
1030 to get control flow in INSNS appear to be impossible to construct:
1031
1032 * If do_compare_rtx_and_jump needs several branches to do comparison
1033 in a mode that needs comparison by parts, we cannot analyze the
1034 number of iterations of the loop, and we never get to unrolling it.
1035
1036 * The code in expand_divmod that was suspected to cause creation of
1037 branching code seems to be only accessed for signed division. The
1038 divisions used by # of iterations analysis are always unsigned.
1039 Problems might arise on architectures that emit branching code
1040 for some operations that may appear in the unroller (especially
1041 for division), but we have no such architectures.
1042
1043 Considering all this, it was decided that we should for now assume
1044 that INSNS can in theory contain control flow insns, but in practice
1045 it never does. So we don't handle the theoretical case, and should
1046 a real failure ever show up, we have a pretty good clue for how to
1047 fix it. */
1048
1049 return bb;
1050 }
1051
1052 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
1053 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
1054 in order to create a jump. */
1055
1056 static rtx
1057 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
1058 rtx cinsn)
1059 {
1060 rtx seq, jump, cond;
1061 enum machine_mode mode;
1062
1063 mode = GET_MODE (op0);
1064 if (mode == VOIDmode)
1065 mode = GET_MODE (op1);
1066
1067 start_sequence ();
1068 if (GET_MODE_CLASS (mode) == MODE_CC)
1069 {
1070 /* A hack -- there seems to be no easy generic way to make a
1071 conditional jump from a ccmode comparison. */
1072 gcc_assert (cinsn);
1073 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
1074 gcc_assert (GET_CODE (cond) == comp);
1075 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
1076 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
1077 emit_jump_insn (copy_insn (PATTERN (cinsn)));
1078 jump = get_last_insn ();
1079 gcc_assert (JUMP_P (jump));
1080 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
1081 LABEL_NUSES (JUMP_LABEL (jump))++;
1082 redirect_jump (jump, label, 0);
1083 }
1084 else
1085 {
1086 gcc_assert (!cinsn);
1087
1088 op0 = force_operand (op0, NULL_RTX);
1089 op1 = force_operand (op1, NULL_RTX);
1090 do_compare_rtx_and_jump (op0, op1, comp, 0,
1091 mode, NULL_RTX, NULL_RTX, label, -1);
1092 jump = get_last_insn ();
1093 gcc_assert (JUMP_P (jump));
1094 JUMP_LABEL (jump) = label;
1095 LABEL_NUSES (label)++;
1096 }
1097 add_int_reg_note (jump, REG_BR_PROB, prob);
1098
1099 seq = get_insns ();
1100 end_sequence ();
1101
1102 return seq;
1103 }
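/* Typical use, copied from unroll_loop_runtime_iterations below, where the
   preconditioning branches are built:

     branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					 block_label (preheader), p,
					 NULL_RTX);
     swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
*/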
1104
1105 /* Unroll LOOP, for which we are able to count the number of iterations at
1106 runtime, LOOP->LPT_DECISION.TIMES times. The transformation does this (with
1107 some extra care for the case n < 0):
1108
1109 for (i = 0; i < n; i++)
1110 body;
1111
1112 ==> (LOOP->LPT_DECISION.TIMES == 3)
1113
1114 i = 0;
1115 mod = n % 4;
1116
1117 switch (mod)
1118 {
1119 case 3:
1120 body; i++;
1121 case 2:
1122 body; i++;
1123 case 1:
1124 body; i++;
1125 case 0: ;
1126 }
1127
1128 while (i < n)
1129 {
1130 body; i++;
1131 body; i++;
1132 body; i++;
1133 body; i++;
1134 }
1135 */
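/* Editorial note: the "switch" above is a sketch.  Below it is realized,
   roughly, as a chain of compare-and-jump basic blocks built by
   compare_and_jump_seq and split_edge_and_insert, one branch per peeled
   copy plus an optional extra check for zero iterations, not as a tabular
   switch.  */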
1136 static void
1137 unroll_loop_runtime_iterations (struct loop *loop)
1138 {
1139 rtx old_niter, niter, init_code, branch_code, tmp;
1140 unsigned i, j, p;
1141 basic_block preheader, *body, swtch, ezc_swtch;
1142 sbitmap wont_exit;
1143 int may_exit_copy;
1144 unsigned n_peel;
1145 edge e;
1146 bool extra_zero_check, last_may_exit;
1147 unsigned max_unroll = loop->lpt_decision.times;
1148 struct niter_desc *desc = get_simple_loop_desc (loop);
1149 bool exit_at_end = loop_exit_at_end_p (loop);
1150 struct opt_info *opt_info = NULL;
1151 bool ok;
1152
1153 if (flag_split_ivs_in_unroller
1154 || flag_variable_expansion_in_unroller)
1155 opt_info = analyze_insns_in_loop (loop);
1156
1157 /* Remember blocks whose dominators will have to be updated. */
1158 auto_vec<basic_block> dom_bbs;
1159
1160 body = get_loop_body (loop);
1161 for (i = 0; i < loop->num_nodes; i++)
1162 {
1163 vec<basic_block> ldom;
1164 basic_block bb;
1165
1166 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1167 FOR_EACH_VEC_ELT (ldom, j, bb)
1168 if (!flow_bb_inside_loop_p (loop, bb))
1169 dom_bbs.safe_push (bb);
1170
1171 ldom.release ();
1172 }
1173 free (body);
1174
1175 if (!exit_at_end)
1176 {
1177 /* Leave exit in first copy (for explanation why see comment in
1178 unroll_loop_constant_iterations). */
1179 may_exit_copy = 0;
1180 n_peel = max_unroll - 1;
1181 extra_zero_check = true;
1182 last_may_exit = false;
1183 }
1184 else
1185 {
1186 /* Leave exit in last copy (for explanation why see comment in
1187 unroll_loop_constant_iterations). */
1188 may_exit_copy = max_unroll;
1189 n_peel = max_unroll;
1190 extra_zero_check = false;
1191 last_may_exit = true;
1192 }
1193
1194 /* Get expression for number of iterations. */
1195 start_sequence ();
1196 old_niter = niter = gen_reg_rtx (desc->mode);
1197 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1198 if (tmp != niter)
1199 emit_move_insn (niter, tmp);
1200
1201 /* Compute the modulo by ANDing the count with max_unroll; we use the fact
1202 that the number of unrollings is a power of two, and thus this is correct
1203 even if there is overflow in the computation. */
1204 niter = expand_simple_binop (desc->mode, AND,
1205 niter, gen_int_mode (max_unroll, desc->mode),
1206 NULL_RTX, 0, OPTAB_LIB_WIDEN);
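/* Illustration: with max_unroll == 3 (four copies of the body),
   niter & 3 == niter % 4 for any unsigned value of niter, including
   values that wrapped around while computing desc->niter_expr.  */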
1207
1208 init_code = get_insns ();
1209 end_sequence ();
1210 unshare_all_rtl_in_chain (init_code);
1211
1212 /* Precondition the loop. */
1213 split_edge_and_insert (loop_preheader_edge (loop), init_code);
1214
1215 auto_vec<edge> remove_edges;
1216
1217 wont_exit = sbitmap_alloc (max_unroll + 2);
1218
1219 /* Peel the first copy of the loop body (almost always we must leave the
1220 exit test here; the only exception is when we have an extra zero check
1221 and the number of iterations is reliable). Also record the place of the
1222 (possible) extra zero check. */
1223 bitmap_clear (wont_exit);
1224 if (extra_zero_check
1225 && !desc->noloop_assumptions)
1226 bitmap_set_bit (wont_exit, 1);
1227 ezc_swtch = loop_preheader_edge (loop)->src;
1228 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1229 1, wont_exit, desc->out_edge,
1230 &remove_edges,
1231 DLTHE_FLAG_UPDATE_FREQ);
1232 gcc_assert (ok);
1233
1234 /* Record the place where the switch will be built for preconditioning. */
1235 swtch = split_edge (loop_preheader_edge (loop));
1236
1237 for (i = 0; i < n_peel; i++)
1238 {
1239 /* Peel the copy. */
1240 bitmap_clear (wont_exit);
1241 if (i != n_peel - 1 || !last_may_exit)
1242 bitmap_set_bit (wont_exit, 1);
1243 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1244 1, wont_exit, desc->out_edge,
1245 &remove_edges,
1246 DLTHE_FLAG_UPDATE_FREQ);
1247 gcc_assert (ok);
1248
1249 /* Create item for switch. */
1250 j = n_peel - i - (extra_zero_check ? 0 : 1);
1251 p = REG_BR_PROB_BASE / (i + 2);
1252
1253 preheader = split_edge (loop_preheader_edge (loop));
1254 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1255 block_label (preheader), p,
1256 NULL_RTX);
1257
1258 /* We rely on the fact that the compare and jump cannot be optimized out,
1259 and hence the cfg we create is correct. */
1260 gcc_assert (branch_code != NULL_RTX);
1261
1262 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1263 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1264 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1265 e = make_edge (swtch, preheader,
1266 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1267 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1268 e->probability = p;
1269 }
1270
1271 if (extra_zero_check)
1272 {
1273 /* Add branch for zero iterations. */
1274 p = REG_BR_PROB_BASE / (max_unroll + 1);
1275 swtch = ezc_swtch;
1276 preheader = split_edge (loop_preheader_edge (loop));
1277 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1278 block_label (preheader), p,
1279 NULL_RTX);
1280 gcc_assert (branch_code != NULL_RTX);
1281
1282 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1283 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1284 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1285 e = make_edge (swtch, preheader,
1286 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1287 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1288 e->probability = p;
1289 }
1290
1291 /* Recount dominators for outer blocks. */
1292 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1293
1294 /* And unroll loop. */
1295
1296 bitmap_ones (wont_exit);
1297 bitmap_clear_bit (wont_exit, may_exit_copy);
1298 opt_info_start_duplication (opt_info);
1299
1300 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1301 max_unroll,
1302 wont_exit, desc->out_edge,
1303 &remove_edges,
1304 DLTHE_FLAG_UPDATE_FREQ
1305 | (opt_info
1306 ? DLTHE_RECORD_COPY_NUMBER
1307 : 0));
1308 gcc_assert (ok);
1309
1310 if (opt_info)
1311 {
1312 apply_opt_in_copies (opt_info, max_unroll, true, true);
1313 free_opt_info (opt_info);
1314 }
1315
1316 free (wont_exit);
1317
1318 if (exit_at_end)
1319 {
1320 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1321 /* Find a new in and out edge; they are in the last copy we have
1322 made. */
1323
1324 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1325 {
1326 desc->out_edge = EDGE_SUCC (exit_block, 0);
1327 desc->in_edge = EDGE_SUCC (exit_block, 1);
1328 }
1329 else
1330 {
1331 desc->out_edge = EDGE_SUCC (exit_block, 1);
1332 desc->in_edge = EDGE_SUCC (exit_block, 0);
1333 }
1334 }
1335
1336 /* Remove the edges. */
1337 FOR_EACH_VEC_ELT (remove_edges, i, e)
1338 remove_path (e);
1339
1340 /* We must be careful when updating the number of iterations due to
1341 preconditioning and the fact that the value must be valid at entry
1342 of the loop. After passing through the above code, we see that
1343 the correct new number of iterations is this: */
1344 gcc_assert (!desc->const_iter);
1345 desc->niter_expr =
1346 simplify_gen_binary (UDIV, desc->mode, old_niter,
1347 gen_int_mode (max_unroll + 1, desc->mode));
1348 loop->nb_iterations_upper_bound
1349 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1350 if (loop->any_estimate)
1351 loop->nb_iterations_estimate
1352 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1353 if (exit_at_end)
1354 {
1355 desc->niter_expr =
1356 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1357 desc->noloop_assumptions = NULL_RTX;
1358 --loop->nb_iterations_upper_bound;
1359 if (loop->any_estimate
1360 && loop->nb_iterations_estimate != 0)
1361 --loop->nb_iterations_estimate;
1362 else
1363 loop->any_estimate = false;
1364 }
1365
1366 if (dump_file)
1367 fprintf (dump_file,
1368 ";; Unrolled loop %d times, counting # of iterations "
1369 "in runtime, %i insns\n",
1370 max_unroll, num_loop_insns (loop));
1371 }
1372
1373 /* Decide whether to simply peel LOOP and how much. */
1374 static void
1375 decide_peel_simple (struct loop *loop, int flags)
1376 {
1377 unsigned npeel;
1378 widest_int iterations;
1379
1380 if (!(flags & UAP_PEEL))
1381 {
1382 /* We were not asked to, just return silently. */
1383 return;
1384 }
1385
1386 if (dump_file)
1387 fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1388
1389 /* npeel = number of iterations to peel. */
1390 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1391 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1392 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1393
1394 /* Skip big loops. */
1395 if (!npeel)
1396 {
1397 if (dump_file)
1398 fprintf (dump_file, ";; Not considering loop, is too big\n");
1399 return;
1400 }
1401
1402 /* Do not simply peel loops with branches inside -- it increases the number
1403 of mispredicts.
1404 The exception is when we do have a profile and thus a good chance
1405 of peeling the number of iterations the loop will iterate in practice.
1406 TODO: this heuristic needs tuning; while for complete unrolling
1407 the branch inside the loop mostly eliminates any improvements, for
1408 peeling that is not the case. Also, a function call inside the loop
1409 is a branch from the branch prediction POV (and probably a better reason
1410 not to unroll/peel). */
1411 if (num_loop_branches (loop) > 1
1412 && profile_status_for_fn (cfun) != PROFILE_READ)
1413 {
1414 if (dump_file)
1415 fprintf (dump_file, ";; Not peeling, contains branches\n");
1416 return;
1417 }
1418
1419 /* If we have a realistic estimate of the number of iterations, use it. */
1420 if (get_estimated_loop_iterations (loop, &iterations))
1421 {
1422 if (wi::leu_p (npeel, iterations))
1423 {
1424 if (dump_file)
1425 {
1426 fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1427 fprintf (dump_file, "%"PRId64,
1428 (int64_t) (iterations.to_shwi () + 1));
1429 fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1430 npeel);
1431 }
1432 return;
1433 }
1434 npeel = iterations.to_shwi () + 1;
1435 }
1436 /* If we have small enough bound on iterations, we can still peel (completely
1437 unroll). */
1438 else if (get_max_loop_iterations (loop, &iterations)
1439 && wi::ltu_p (iterations, npeel))
1440 npeel = iterations.to_shwi () + 1;
1441 else
1442 {
1443 /* For now we have no good heuristics to decide whether loop peeling
1444 will be effective, so disable it. */
1445 if (dump_file)
1446 fprintf (dump_file,
1447 ";; Not peeling loop, no evidence it will be profitable\n");
1448 return;
1449 }
1450
1451 /* Success. */
1452 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1453 loop->lpt_decision.times = npeel;
1454 }
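/* Worked example (editorial): if the profile estimates 2 iterations and the
   size limits above allow npeel >= 3, npeel is set to iterations + 1 == 3,
   so the peeled copies usually cover the whole execution and the loop body
   proper is rarely entered.  */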
1455
1456 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1457
1458 while (cond)
1459 body;
1460
1461 ==> (LOOP->LPT_DECISION.TIMES == 3)
1462
1463 if (!cond) goto end;
1464 body;
1465 if (!cond) goto end;
1466 body;
1467 if (!cond) goto end;
1468 body;
1469 while (cond)
1470 body;
1471 end: ;
1472 */
1473 static void
1474 peel_loop_simple (struct loop *loop)
1475 {
1476 sbitmap wont_exit;
1477 unsigned npeel = loop->lpt_decision.times;
1478 struct niter_desc *desc = get_simple_loop_desc (loop);
1479 struct opt_info *opt_info = NULL;
1480 bool ok;
1481
1482 if (flag_split_ivs_in_unroller && npeel > 1)
1483 opt_info = analyze_insns_in_loop (loop);
1484
1485 wont_exit = sbitmap_alloc (npeel + 1);
1486 bitmap_clear (wont_exit);
1487
1488 opt_info_start_duplication (opt_info);
1489
1490 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1491 npeel, wont_exit, NULL,
1492 NULL, DLTHE_FLAG_UPDATE_FREQ
1493 | (opt_info
1494 ? DLTHE_RECORD_COPY_NUMBER
1495 : 0));
1496 gcc_assert (ok);
1497
1498 free (wont_exit);
1499
1500 if (opt_info)
1501 {
1502 apply_opt_in_copies (opt_info, npeel, false, false);
1503 free_opt_info (opt_info);
1504 }
1505
1506 if (desc->simple_p)
1507 {
1508 if (desc->const_iter)
1509 {
1510 desc->niter -= npeel;
1511 desc->niter_expr = GEN_INT (desc->niter);
1512 desc->noloop_assumptions = NULL_RTX;
1513 }
1514 else
1515 {
1516 /* We cannot just update niter_expr, as its value might be clobbered
1517 inside the loop. We could handle this by counting the number of iterations
1518 into a temporary just like we do in runtime unrolling, but it does not
1519 seem worthwhile. */
1520 free_simple_loop_desc (loop);
1521 }
1522 }
1523 if (dump_file)
1524 fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1525 }
1526
1527 /* Decide whether to unroll LOOP stupidly and how much. */
1528 static void
1529 decide_unroll_stupid (struct loop *loop, int flags)
1530 {
1531 unsigned nunroll, nunroll_by_av, i;
1532 struct niter_desc *desc;
1533 widest_int iterations;
1534
1535 if (!(flags & UAP_UNROLL_ALL))
1536 {
1537 /* We were not asked to, just return silently. */
1538 return;
1539 }
1540
1541 if (dump_file)
1542 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1543
1544 /* nunroll = total number of copies of the original loop body in
1545 unrolled loop (i.e. if it is 2, we have to duplicate loop body once). */
1546 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1547 nunroll_by_av
1548 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1549 if (nunroll > nunroll_by_av)
1550 nunroll = nunroll_by_av;
1551 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1552 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1553
1554 if (targetm.loop_unroll_adjust)
1555 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1556
1557 /* Skip big loops. */
1558 if (nunroll <= 1)
1559 {
1560 if (dump_file)
1561 fprintf (dump_file, ";; Not considering loop, is too big\n");
1562 return;
1563 }
1564
1565 /* Check for simple loops. */
1566 desc = get_simple_loop_desc (loop);
1567
1568 /* Check simpleness. */
1569 if (desc->simple_p && !desc->assumptions)
1570 {
1571 if (dump_file)
1572 fprintf (dump_file, ";; The loop is simple\n");
1573 return;
1574 }
1575
1576 /* Do not unroll loops with branches inside -- it increases the number
1577 of mispredicts.
1578 TODO: this heuristic needs tuning; a call inside the loop body
1579 is also a relatively good reason not to unroll. */
1580 if (num_loop_branches (loop) > 1)
1581 {
1582 if (dump_file)
1583 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1584 return;
1585 }
1586
1587 /* Check whether the loop rolls. */
1588 if ((get_estimated_loop_iterations (loop, &iterations)
1589 || get_max_loop_iterations (loop, &iterations))
1590 && wi::ltu_p (iterations, 2 * nunroll))
1591 {
1592 if (dump_file)
1593 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1594 return;
1595 }
1596
1597 /* Success. Now force nunroll to be a power of 2, as it seems that this
1598 improves results (partially because of better alignments, partially
1599 because of some dark magic). */
1600 for (i = 1; 2 * i <= nunroll; i *= 2)
1601 continue;
1602
1603 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1604 loop->lpt_decision.times = i - 1;
1605 }
1606
1607 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1608
1609 while (cond)
1610 body;
1611
1612 ==> (LOOP->LPT_DECISION.TIMES == 3)
1613
1614 while (cond)
1615 {
1616 body;
1617 if (!cond) break;
1618 body;
1619 if (!cond) break;
1620 body;
1621 if (!cond) break;
1622 body;
1623 }
1624 */
1625 static void
1626 unroll_loop_stupid (struct loop *loop)
1627 {
1628 sbitmap wont_exit;
1629 unsigned nunroll = loop->lpt_decision.times;
1630 struct niter_desc *desc = get_simple_loop_desc (loop);
1631 struct opt_info *opt_info = NULL;
1632 bool ok;
1633
1634 if (flag_split_ivs_in_unroller
1635 || flag_variable_expansion_in_unroller)
1636 opt_info = analyze_insns_in_loop (loop);
1637
1638
1639 wont_exit = sbitmap_alloc (nunroll + 1);
1640 bitmap_clear (wont_exit);
1641 opt_info_start_duplication (opt_info);
1642
1643 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1644 nunroll, wont_exit,
1645 NULL, NULL,
1646 DLTHE_FLAG_UPDATE_FREQ
1647 | (opt_info
1648 ? DLTHE_RECORD_COPY_NUMBER
1649 : 0));
1650 gcc_assert (ok);
1651
1652 if (opt_info)
1653 {
1654 apply_opt_in_copies (opt_info, nunroll, true, true);
1655 free_opt_info (opt_info);
1656 }
1657
1658 free (wont_exit);
1659
1660 if (desc->simple_p)
1661 {
1662 /* We may indeed get here if there are nontrivial assumptions required
1663 for the loop to be really simple. We could update the counts, but the
1664 problem is that we are unable to decide which exit will be taken
1665 (not really true when the number of iterations is constant,
1666 but no one will do anything with this information, so we do not
1667 worry about it). */
1668 desc->simple_p = false;
1669 }
1670
1671 if (dump_file)
1672 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1673 nunroll, num_loop_insns (loop));
1674 }
1675
1676 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1677 Set *DEBUG_USES to the number of debug insns that reference the
1678 variable. */
1679
1680 bool
1681 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1682 int *debug_uses)
1683 {
1684 basic_block *body, bb;
1685 unsigned i;
1686 int count_ref = 0;
1687 rtx insn;
1688
1689 body = get_loop_body (loop);
1690 for (i = 0; i < loop->num_nodes; i++)
1691 {
1692 bb = body[i];
1693
1694 FOR_BB_INSNS (bb, insn)
1695 if (!rtx_referenced_p (reg, insn))
1696 continue;
1697 else if (DEBUG_INSN_P (insn))
1698 ++*debug_uses;
1699 else if (++count_ref > 1)
1700 break;
1701 }
1702 free (body);
1703 return (count_ref == 1);
1704 }
1705
1706 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1707
1708 static void
1709 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1710 {
1711 basic_block *body, bb;
1712 unsigned i;
1713 rtx insn;
1714
1715 body = get_loop_body (loop);
1716 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1717 {
1718 bb = body[i];
1719
1720 FOR_BB_INSNS (bb, insn)
1721 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1722 continue;
1723 else
1724 {
1725 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1726 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1727 if (!--debug_uses)
1728 break;
1729 }
1730 }
1731 free (body);
1732 }
1733
1734 /* Determine whether INSN contains an accumulator
1735 which can be expanded into separate copies,
1736 one for each copy of the LOOP body.
1737
1738 for (i = 0 ; i < n; i++)
1739 sum += a[i];
1740
1741 ==>
1742
1743 sum += a[i]
1744 ....
1745 i = i+1;
1746 sum1 += a[i]
1747 ....
1748 i = i+1
1749 sum2 += a[i];
1750 ....
1751
1752 Return NULL if INSN contains no opportunity for expansion of accumulator.
1753 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1754 information and return a pointer to it.
1755 */
1756
1757 static struct var_to_expand *
1758 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1759 {
1760 rtx set, dest, src;
1761 struct var_to_expand *ves;
1762 unsigned accum_pos;
1763 enum rtx_code code;
1764 int debug_uses = 0;
1765
1766 set = single_set (insn);
1767 if (!set)
1768 return NULL;
1769
1770 dest = SET_DEST (set);
1771 src = SET_SRC (set);
1772 code = GET_CODE (src);
1773
1774 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1775 return NULL;
1776
1777 if (FLOAT_MODE_P (GET_MODE (dest)))
1778 {
1779 if (!flag_associative_math)
1780 return NULL;
1781 /* In the case of FMA, we're also changing the rounding. */
1782 if (code == FMA && !flag_unsafe_math_optimizations)
1783 return NULL;
1784 }
1785
1786 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1787 in MD. But if there is no optab to generate the insn, we cannot
1788 perform the variable expansion. This can happen if an MD provides
1789 an insn but not a named pattern to generate it, for example to avoid
1790 producing code that needs additional mode switches like for x87/mmx.
1791
1792 So we check have_insn_for which looks for an optab for the operation
1793 in SRC. If it doesn't exist, we can't perform the expansion even
1794 though INSN is valid. */
1795 if (!have_insn_for (code, GET_MODE (src)))
1796 return NULL;
1797
1798 if (!REG_P (dest)
1799 && !(GET_CODE (dest) == SUBREG
1800 && REG_P (SUBREG_REG (dest))))
1801 return NULL;
1802
1803 /* Find the accumulator use within the operation. */
1804 if (code == FMA)
1805 {
1806 /* We only support accumulation via FMA in the ADD position. */
1807 if (!rtx_equal_p (dest, XEXP (src, 2)))
1808 return NULL;
1809 accum_pos = 2;
1810 }
1811 else if (rtx_equal_p (dest, XEXP (src, 0)))
1812 accum_pos = 0;
1813 else if (rtx_equal_p (dest, XEXP (src, 1)))
1814 {
1815 /* The method of expansion that we are using, which initializes the
1816 expansions with zero and sums them up at the end of
1817 the computation, will yield wrong results for (x = something - x);
1818 thus avoid using it in that case. */
1819 if (code == MINUS)
1820 return NULL;
1821 accum_pos = 1;
1822 }
1823 else
1824 return NULL;
1825
1826 /* It must not otherwise be used. */
1827 if (code == FMA)
1828 {
1829 if (rtx_referenced_p (dest, XEXP (src, 0))
1830 || rtx_referenced_p (dest, XEXP (src, 1)))
1831 return NULL;
1832 }
1833 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1834 return NULL;
1835
1836 /* It must be used in exactly one insn. */
1837 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1838 return NULL;
1839
1840 if (dump_file)
1841 {
1842 fprintf (dump_file, "\n;; Expanding Accumulator ");
1843 print_rtl (dump_file, dest);
1844 fprintf (dump_file, "\n");
1845 }
1846
1847 if (debug_uses)
1848 /* Instead of resetting the debug insns, we could replace each
1849 debug use in the loop with the sum or product of all expanded
1850 accumulators. Since we'll only know of all expansions at the
1851 end, we'd have to keep track of which vars_to_expand a debug
1852 insn in the loop references, take note of each copy of the
1853 debug insn during unrolling, and when it's all done, compute
1854 the sum or product of each variable and adjust the original
1855 debug insn and each copy thereof. What a pain! */
1856 reset_debug_uses_in_loop (loop, dest, debug_uses);
1857
1858 /* Record the accumulator to expand. */
1859 ves = XNEW (struct var_to_expand);
1860 ves->insn = insn;
1861 ves->reg = copy_rtx (dest);
1862 ves->var_expansions.create (1);
1863 ves->next = NULL;
1864 ves->op = GET_CODE (src);
1865 ves->expansion_count = 0;
1866 ves->reuse_expansion = 0;
1867 return ves;
1868 }
1869
1870 /* Determine whether there is an induction variable in INSN that
1871 we would like to split during unrolling.
1872
1873 I.e. replace
1874
1875 i = i + 1;
1876 ...
1877 i = i + 1;
1878 ...
1879 i = i + 1;
1880 ...
1881
1882 chains of this type with
1883
1884 i0 = i + 1
1885 ...
1886 i = i0 + 1
1887 ...
1888 i = i0 + 2
1889 ...
1890
1891 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1892 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1893 pointer to it. */
1894
1895 static struct iv_to_split *
1896 analyze_iv_to_split_insn (rtx insn)
1897 {
1898 rtx set, dest;
1899 struct rtx_iv iv;
1900 struct iv_to_split *ivts;
1901 bool ok;
1902
1903 /* For now we just split the basic induction variables. Later this may be
1904 extended, for example, to also handle addresses of memory references. */
1905 set = single_set (insn);
1906 if (!set)
1907 return NULL;
1908
1909 dest = SET_DEST (set);
1910 if (!REG_P (dest))
1911 return NULL;
1912
1913 if (!biv_p (insn, dest))
1914 return NULL;
1915
1916 ok = iv_analyze_result (insn, dest, &iv);
1917
1918 /* This used to be an assert under the assumption that if biv_p returns
1919 true that iv_analyze_result must also return true. However, that
1920 assumption is not strictly correct as evidenced by pr25569.
1921
1922 Returning NULL when iv_analyze_result returns false is safe and
1923 avoids the problems in pr25569 until the iv_analyze_* routines
1924 can be fixed, which is apparently hard and time consuming
1925 according to their author. */
1926 if (! ok)
1927 return NULL;
1928
1929 if (iv.step == const0_rtx
1930 || iv.mode != iv.extend_mode)
1931 return NULL;
1932
1933 /* Record the insn to split. */
1934 ivts = XNEW (struct iv_to_split);
1935 ivts->insn = insn;
1936 ivts->orig_var = dest;
1937 ivts->base_var = NULL_RTX;
1938 ivts->step = iv.step;
1939 ivts->next = NULL;
1940
1941 return ivts;
1942 }
1943
1944 /* Determine which insns in LOOP can be optimized.
1945 Return an OPT_INFO struct with the relevant hash tables filled
1946 with all insns to be optimized. The FIRST_NEW_BLOCK field
1947 is undefined for the return value. */
1948
1949 static struct opt_info *
1950 analyze_insns_in_loop (struct loop *loop)
1951 {
1952 basic_block *body, bb;
1953 unsigned i;
1954 struct opt_info *opt_info = XCNEW (struct opt_info);
1955 rtx insn;
1956 struct iv_to_split *ivts = NULL;
1957 struct var_to_expand *ves = NULL;
1958 iv_to_split **slot1;
1959 var_to_expand **slot2;
1960 vec<edge> edges = get_loop_exit_edges (loop);
1961 edge exit;
1962 bool can_apply = false;
1963
1964 iv_analysis_loop_init (loop);
1965
1966 body = get_loop_body (loop);
1967
1968 if (flag_split_ivs_in_unroller)
1969 {
1970 opt_info->insns_to_split
1971 = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1972 opt_info->iv_to_split_head = NULL;
1973 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1974 }
1975
1976 /* Record the loop exit bb and loop preheader before the unrolling. */
1977 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1978
1979 if (edges.length () == 1)
1980 {
1981 exit = edges[0];
1982 if (!(exit->flags & EDGE_COMPLEX))
1983 {
1984 opt_info->loop_exit = split_edge (exit);
1985 can_apply = true;
1986 }
1987 }
1988
1989 if (flag_variable_expansion_in_unroller
1990 && can_apply)
1991 {
1992 opt_info->insns_with_var_to_expand
1993 = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1994 opt_info->var_to_expand_head = NULL;
1995 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1996 }
1997
1998 for (i = 0; i < loop->num_nodes; i++)
1999 {
2000 bb = body[i];
2001 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
2002 continue;
2003
2004 FOR_BB_INSNS (bb, insn)
2005 {
2006 if (!INSN_P (insn))
2007 continue;
2008
2009 if (opt_info->insns_to_split)
2010 ivts = analyze_iv_to_split_insn (insn);
2011
2012 if (ivts)
2013 {
2014 slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
2015 gcc_assert (*slot1 == NULL);
2016 *slot1 = ivts;
2017 *opt_info->iv_to_split_tail = ivts;
2018 opt_info->iv_to_split_tail = &ivts->next;
2019 continue;
2020 }
2021
2022 if (opt_info->insns_with_var_to_expand)
2023 ves = analyze_insn_to_expand_var (loop, insn);
2024
2025 if (ves)
2026 {
2027 slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
2028 gcc_assert (*slot2 == NULL);
2029 *slot2 = ves;
2030 *opt_info->var_to_expand_tail = ves;
2031 opt_info->var_to_expand_tail = &ves->next;
2032 }
2033 }
2034 }
2035
2036 edges.release ();
2037 free (body);
2038 return opt_info;
2039 }
2040
2041 /* Called just before loop duplication. Records the start of the duplicated
2042 area in OPT_INFO. */
2043
2044 static void
2045 opt_info_start_duplication (struct opt_info *opt_info)
2046 {
2047 if (opt_info)
2048 opt_info->first_new_block = last_basic_block_for_fn (cfun);
2049 }
2050
2051 /* Determine the number of iterations between initialization of the base
2052 variable and the current copy (N_COPY). N_COPIES is the total number
2053 of newly created copies. UNROLLING is true if we are unrolling
2054 (not peeling) the loop. */
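/* For illustration (numbers made up): with N_COPIES == 2, unrolling
gives deltas 0, 1 and 2 for bodies 0, 1 and 2 respectively, while
peeling gives delta 0 for copy 1, delta 1 for copy 2, and delta 2
for the original body (N_COPY == 0), which is executed last. */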
2055
2056 static unsigned
2057 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
2058 {
2059 if (unrolling)
2060 {
2061 /* If we are unrolling, initialization is done in the original loop
2062 body (number 0). */
2063 return n_copy;
2064 }
2065 else
2066 {
2067 /* If we are peeling, the copy in which the initialization occurs has
2068 number 1. The original loop (number 0) is the last. */
2069 if (n_copy)
2070 return n_copy - 1;
2071 else
2072 return n_copies;
2073 }
2074 }
2075
2076 /* Allocate the basic variable (i0) for the induction variable chain. */
2077
2078 static void
2079 allocate_basic_variable (struct iv_to_split *ivts)
2080 {
2081 rtx expr = SET_SRC (single_set (ivts->insn));
2082
2083 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2084 }
2085
2086 /* Insert initialization of the basic variable of IVTS before INSN, taking
2087 the initial value from INSN. */
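/* A hypothetical example: for INSN "i = i + 1" with base variable i0,
this emits "i0 = i + 1" immediately before INSN, so that INSN and
its copies can later be rewritten in terms of i0 by split_iv. */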
2088
2089 static void
2090 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2091 {
2092 rtx expr = copy_rtx (SET_SRC (single_set (insn)));
2093 rtx seq;
2094
2095 start_sequence ();
2096 expr = force_operand (expr, ivts->base_var);
2097 if (expr != ivts->base_var)
2098 emit_move_insn (ivts->base_var, expr);
2099 seq = get_insns ();
2100 end_sequence ();
2101
2102 emit_insn_before (seq, insn);
2103 }
2104
2105 /* Replace the use of induction variable described in IVTS in INSN
2106 by base variable + DELTA * step. */
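/* For instance (names made up): with base variable i0, step 1 and
DELTA == 2, the use of the IV in INSN is replaced by i0 + 2; if
that replacement is not a valid change, the value is first forced
into a fresh register. */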
2107
2108 static void
2109 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2110 {
2111 rtx expr, *loc, seq, incr, var;
2112 enum machine_mode mode = GET_MODE (ivts->base_var);
2113 rtx src, dest, set;
2114
2115 /* Construct base + DELTA * step. */
2116 if (!delta)
2117 expr = ivts->base_var;
2118 else
2119 {
2120 incr = simplify_gen_binary (MULT, mode,
2121 ivts->step, gen_int_mode (delta, mode));
2122 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2123 ivts->base_var, incr);
2124 }
2125
2126 /* Figure out where to do the replacement. */
2127 loc = &SET_SRC (single_set (insn));
2128
2129 /* If we can make the replacement right away, we're done. */
2130 if (validate_change (insn, loc, expr, 0))
2131 return;
2132
2133 /* Otherwise, force EXPR into a register and try again. */
2134 start_sequence ();
2135 var = gen_reg_rtx (mode);
2136 expr = force_operand (expr, var);
2137 if (expr != var)
2138 emit_move_insn (var, expr);
2139 seq = get_insns ();
2140 end_sequence ();
2141 emit_insn_before (seq, insn);
2142
2143 if (validate_change (insn, loc, var, 0))
2144 return;
2145
2146 /* As a last resort, try recreating the assignment in INSN
2147 completely from scratch. */
2148 set = single_set (insn);
2149 gcc_assert (set);
2150
2151 start_sequence ();
2152 *loc = var;
2153 src = copy_rtx (SET_SRC (set));
2154 dest = copy_rtx (SET_DEST (set));
2155 src = force_operand (src, dest);
2156 if (src != dest)
2157 emit_move_insn (dest, src);
2158 seq = get_insns ();
2159 end_sequence ();
2160
2161 emit_insn_before (seq, insn);
2162 delete_insn (insn);
2163 }
2164
2165
2166 /* Return one expansion of the accumulator recorded in struct VE. */
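/* The expansions are handed out round-robin: the first call returns
VE->reg itself, subsequent calls return each recorded expansion in
turn, and after the last one the cycle restarts at VE->reg. */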
2167
2168 static rtx
2169 get_expansion (struct var_to_expand *ve)
2170 {
2171 rtx reg;
2172
2173 if (ve->reuse_expansion == 0)
2174 reg = ve->reg;
2175 else
2176 reg = ve->var_expansions[ve->reuse_expansion - 1];
2177
2178 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2179 ve->reuse_expansion = 0;
2180 else
2181 ve->reuse_expansion++;
2182
2183 return reg;
2184 }
2185
2186
2187 /* Given INSN replace the uses of the accumulator recorded in VE
2188 with a new register. */
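/* Sketch of the effect (register names made up): the first few copies
of "sum += x" are rewritten to fresh accumulators, e.g. "sum1 += x"
and "sum2 += x", until PARAM_MAX_VARIABLE_EXPANSIONS registers have
been created; any further copies reuse them via get_expansion. */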
2189
2190 static void
2191 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2192 {
2193 rtx new_reg, set;
2194 bool really_new_expansion = false;
2195
2196 set = single_set (insn);
2197 gcc_assert (set);
2198
2199 /* Generate a new register only if the expansion limit has not been
2200 reached. Else reuse an already existing expansion. */
2201 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2202 {
2203 really_new_expansion = true;
2204 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2205 }
2206 else
2207 new_reg = get_expansion (ve);
2208
2209 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2210 if (apply_change_group ())
2211 if (really_new_expansion)
2212 {
2213 ve->var_expansions.safe_push (new_reg);
2214 ve->expansion_count++;
2215 }
2216 }
2217
2218 /* Initialize the variable expansions in the loop preheader. PLACE is the
2219 loop-preheader basic block where the initialization of the
2220 expansions should take place. The expansions are initialized with
2221 (-0) when the operation is plus or minus, to honor signed zero. This
2222 way we prevent cases where the sign of the final result is
2223 affected by the sign of the expansion. Here is an example to
2224 demonstrate this:
2225
2226 for (i = 0 ; i < n; i++)
2227 sum += something;
2228
2229 ==>
2230
2231 sum += something
2232 ....
2233 i = i+1;
2234 sum1 += something
2235 ....
2236 i = i+1
2237 sum2 += something;
2238 ....
2239
2240 When SUM is initialized with -zero and SOMETHING is also -zero, the
2241 final result of SUM should be -zero; thus the expansions sum1 and sum2
2242 must be initialized with -zero as well (otherwise we would get +zero
2243 as the final result). */
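/* Accordingly, for PLUS/MINUS (and FMA) each expansion below is
initialized with the negation of +0.0, conceptually -0.0, whenever
the mode honors signed zeros; for MULT the neutral element 1 is
used instead. */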
2244
2245 static void
2246 insert_var_expansion_initialization (struct var_to_expand *ve,
2247 basic_block place)
2248 {
2249 rtx seq, var, zero_init;
2250 unsigned i;
2251 enum machine_mode mode = GET_MODE (ve->reg);
2252 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2253
2254 if (ve->var_expansions.length () == 0)
2255 return;
2256
2257 start_sequence ();
2258 switch (ve->op)
2259 {
2260 case FMA:
2261 /* Note that we only accumulate FMA via the ADD operand. */
2262 case PLUS:
2263 case MINUS:
2264 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2265 {
2266 if (honor_signed_zero_p)
2267 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2268 else
2269 zero_init = CONST0_RTX (mode);
2270 emit_move_insn (var, zero_init);
2271 }
2272 break;
2273
2274 case MULT:
2275 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2276 {
2277 zero_init = CONST1_RTX (GET_MODE (var));
2278 emit_move_insn (var, zero_init);
2279 }
2280 break;
2281
2282 default:
2283 gcc_unreachable ();
2284 }
2285
2286 seq = get_insns ();
2287 end_sequence ();
2288
2289 emit_insn_after (seq, BB_END (place));
2290 }
2291
2292 /* Combine the variable expansions at the loop exit. PLACE is the
2293 loop exit basic block where the summation of the expansions should
2294 take place. */
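/* As an illustration (names made up): for accumulator sum with
expansions sum1 and sum2, this emits code computing
sum = sum2 + (sum1 + sum) (or the analogous product for MULT)
right after the basic-block note of PLACE. */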
2295
2296 static void
2297 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2298 {
2299 rtx sum = ve->reg;
2300 rtx expr, seq, var, insn;
2301 unsigned i;
2302
2303 if (ve->var_expansions.length () == 0)
2304 return;
2305
2306 start_sequence ();
2307 switch (ve->op)
2308 {
2309 case FMA:
2310 /* Note that we only accumulate FMA via the ADD operand. */
2311 case PLUS:
2312 case MINUS:
2313 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2314 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2315 break;
2316
2317 case MULT:
2318 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2319 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2320 break;
2321
2322 default:
2323 gcc_unreachable ();
2324 }
2325
2326 expr = force_operand (sum, ve->reg);
2327 if (expr != ve->reg)
2328 emit_move_insn (ve->reg, expr);
2329 seq = get_insns ();
2330 end_sequence ();
2331
2332 insn = BB_HEAD (place);
2333 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2334 insn = NEXT_INSN (insn);
2335
2336 emit_insn_after (seq, insn);
2337 }
2338
2339 /* Strip away REG_EQUAL notes for IVs we're splitting.
2340
2341 Updating REG_EQUAL notes for IVs we split is tricky: we
2342 cannot tell until after unrolling, DF-rescanning, and liveness
2343 updating whether an EQ_USE is reached by the split IV while
2344 the IV reg is still live. See PR55006.
2345
2346 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2347 because RTL loop-iv requires us to defer rescanning insns and
2348 any notes attached to them. So resort to old techniques... */
2349
2350 static void
2351 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2352 {
2353 struct iv_to_split *ivts;
2354 rtx note = find_reg_equal_equiv_note (insn);
2355 if (! note)
2356 return;
2357 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2358 if (reg_mentioned_p (ivts->orig_var, note))
2359 {
2360 remove_note (insn, note);
2361 return;
2362 }
2363 }
2364
2365 /* Apply loop optimizations in loop copies, using the
2366 data gathered during the unrolling. Structure
2367 OPT_INFO records that data.
2368
2369 UNROLLING is true if we unrolled (not peeled) the loop.
2370 REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
2371 of the loop (as happens in complete unrolling, but not in ordinary
2372 peeling of the loop). */
2373
2374 static void
2375 apply_opt_in_copies (struct opt_info *opt_info,
2376 unsigned n_copies, bool unrolling,
2377 bool rewrite_original_loop)
2378 {
2379 unsigned i, delta;
2380 basic_block bb, orig_bb;
2381 rtx insn, orig_insn, next;
2382 struct iv_to_split ivts_templ, *ivts;
2383 struct var_to_expand ve_templ, *ves;
2384
2385 /* Sanity check -- we need to put initialization in the original loop
2386 body. */
2387 gcc_assert (!unrolling || rewrite_original_loop);
2388
2389 /* Allocate the basic variables (i0). */
2390 if (opt_info->insns_to_split)
2391 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2392 allocate_basic_variable (ivts);
2393
2394 for (i = opt_info->first_new_block;
2395 i < (unsigned) last_basic_block_for_fn (cfun);
2396 i++)
2397 {
2398 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2399 orig_bb = get_bb_original (bb);
2400
2401 /* bb->aux holds position in copy sequence initialized by
2402 duplicate_loop_to_header_edge. */
2403 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2404 unrolling);
2405 bb->aux = 0;
2406 orig_insn = BB_HEAD (orig_bb);
2407 FOR_BB_INSNS_SAFE (bb, insn, next)
2408 {
2409 if (!INSN_P (insn)
2410 || (DEBUG_INSN_P (insn)
2411 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2412 continue;
2413
2414 while (!INSN_P (orig_insn)
2415 || (DEBUG_INSN_P (orig_insn)
2416 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2417 == LABEL_DECL)))
2418 orig_insn = NEXT_INSN (orig_insn);
2419
2420 ivts_templ.insn = orig_insn;
2421 ve_templ.insn = orig_insn;
2422
2423 /* Apply splitting iv optimization. */
2424 if (opt_info->insns_to_split)
2425 {
2426 maybe_strip_eq_note_for_split_iv (opt_info, insn);
2427
2428 ivts = opt_info->insns_to_split->find (&ivts_templ);
2429
2430 if (ivts)
2431 {
2432 gcc_assert (GET_CODE (PATTERN (insn))
2433 == GET_CODE (PATTERN (orig_insn)));
2434
2435 if (!delta)
2436 insert_base_initialization (ivts, insn);
2437 split_iv (ivts, insn, delta);
2438 }
2439 }
2440 /* Apply variable expansion optimization. */
2441 if (unrolling && opt_info->insns_with_var_to_expand)
2442 {
2443 ves = (struct var_to_expand *)
2444 opt_info->insns_with_var_to_expand->find (&ve_templ);
2445 if (ves)
2446 {
2447 gcc_assert (GET_CODE (PATTERN (insn))
2448 == GET_CODE (PATTERN (orig_insn)));
2449 expand_var_during_unrolling (ves, insn);
2450 }
2451 }
2452 orig_insn = NEXT_INSN (orig_insn);
2453 }
2454 }
2455
2456 if (!rewrite_original_loop)
2457 return;
2458
2459 /* Initialize the variable expansions in the loop preheader
2460 and take care of combining them at the loop exit. */
2461 if (opt_info->insns_with_var_to_expand)
2462 {
2463 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2464 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2465 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2466 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2467 }
2468
2469 /* Also rewrite the original loop body. Find its blocks as the originals
2470 of the blocks in the last copied iteration, i.e. those that have
2471 get_bb_copy (get_bb_original (bb)) == bb. */
2472 for (i = opt_info->first_new_block;
2473 i < (unsigned) last_basic_block_for_fn (cfun);
2474 i++)
2475 {
2476 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2477 orig_bb = get_bb_original (bb);
2478 if (get_bb_copy (orig_bb) != bb)
2479 continue;
2480
2481 delta = determine_split_iv_delta (0, n_copies, unrolling);
2482 for (orig_insn = BB_HEAD (orig_bb);
2483 orig_insn != NEXT_INSN (BB_END (bb));
2484 orig_insn = next)
2485 {
2486 next = NEXT_INSN (orig_insn);
2487
2488 if (!INSN_P (orig_insn))
2489 continue;
2490
2491 ivts_templ.insn = orig_insn;
2492 if (opt_info->insns_to_split)
2493 {
2494 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2495
2496 ivts = (struct iv_to_split *)
2497 opt_info->insns_to_split->find (&ivts_templ);
2498 if (ivts)
2499 {
2500 if (!delta)
2501 insert_base_initialization (ivts, orig_insn);
2502 split_iv (ivts, orig_insn, delta);
2503 continue;
2504 }
2505 }
2506
2507 }
2508 }
2509 }
2510
2511 /* Release OPT_INFO. */
2512
2513 static void
2514 free_opt_info (struct opt_info *opt_info)
2515 {
2516 delete opt_info->insns_to_split;
2517 opt_info->insns_to_split = NULL;
2518 if (opt_info->insns_with_var_to_expand)
2519 {
2520 struct var_to_expand *ves;
2521
2522 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2523 ves->var_expansions.release ();
2524 delete opt_info->insns_with_var_to_expand;
2525 opt_info->insns_with_var_to_expand = NULL;
2526 }
2527 free (opt_info);
2528 }