gcc/loop-unroll.c
1 /* Loop unrolling.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "profile.h"
29 #include "basic-block.h"
30 #include "cfgloop.h"
31 #include "params.h"
32 #include "expr.h"
33 #include "hash-table.h"
34 #include "recog.h"
35 #include "target.h"
36 #include "dumpfile.h"
37
38 /* This pass performs loop unrolling. We only perform this
39 optimization on innermost loops (with a single exception) because
40 the impact on performance is greatest there, and we want to avoid
41 unnecessary code size growth. The gain comes from greater sequentiality
42 of code, better code for further passes to optimize, and in some cases
43 fewer tests of exit conditions. The main problem is code growth,
44 which impacts performance negatively due to cache effects.
45
46 What we do:
47
48 -- unrolling of loops that roll a constant number of times; this is
49 almost always a win, as we get rid of exit condition tests.
50 -- unrolling of loops that roll a number of times that we can compute
51 at runtime; we also get rid of exit condition tests here, but there
52 is the extra expense of calculating the number of iterations
53 -- simple unrolling of remaining loops; this is performed only if we
54 are asked to, as the gain is questionable in this case and often
55 it may even slow down the code
56 For more detailed descriptions of each of those, see comments at
57 appropriate function below.
58
59 There are a lot of parameters (defined and described in params.def) that
60 control how much we unroll.
61
62 ??? A great problem is that we don't have a good way to determine
63 how many times we should unroll a loop; the experiments I have made
64 showed that this choice may affect performance on the order of several %.
65 */
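/* A note on how the pass is invoked, assuming the usual mapping of GCC
   options to flags: -funroll-loops passes UAP_UNROLL, enabling the
   constant- and runtime-iteration transformations, while
   -funroll-all-loops also passes UAP_UNROLL_ALL, enabling the "stupid"
   unrolling of the remaining loops.  */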
66
67 /* Information about induction variables to split. */
68
69 struct iv_to_split
70 {
71 rtx_insn *insn; /* The insn in which the induction variable occurs. */
72 rtx orig_var; /* The variable (register) for the IV before split. */
73 rtx base_var; /* The variable on which the values in further
74 iterations are based. */
75 rtx step; /* Step of the induction variable. */
76 struct iv_to_split *next; /* Next entry in walking order. */
77 };
78
79 /* Information about accumulators to expand. */
80
81 struct var_to_expand
82 {
83 rtx_insn *insn; /* The insn in which the variable expansion occurs. */
84 rtx reg; /* The accumulator which is expanded. */
85 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
86 struct var_to_expand *next; /* Next entry in walking order. */
87 enum rtx_code op; /* The type of the accumulation - addition, subtraction
88 or multiplication. */
89 int expansion_count; /* Count the number of expansions generated so far. */
90 int reuse_expansion; /* The expansion we intend to reuse to expand
91 the accumulator. If REUSE_EXPANSION is 0 reuse
92 the original accumulator. Else use
93 var_expansions[REUSE_EXPANSION - 1]. */
94 };
95
96 /* Hashtable helper for iv_to_split. */
97
98 struct iv_split_hasher : typed_free_remove <iv_to_split>
99 {
100 typedef iv_to_split value_type;
101 typedef iv_to_split compare_type;
102 static inline hashval_t hash (const value_type *);
103 static inline bool equal (const value_type *, const compare_type *);
104 };
105
106
107 /* A hash function for information about insns to split. */
108
109 inline hashval_t
110 iv_split_hasher::hash (const value_type *ivts)
111 {
112 return (hashval_t) INSN_UID (ivts->insn);
113 }
114
115 /* An equality function for information about insns to split. */
116
117 inline bool
118 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
119 {
120 return i1->insn == i2->insn;
121 }
122
123 /* Hashtable helper for var_to_expand. */
124
125 struct var_expand_hasher : typed_free_remove <var_to_expand>
126 {
127 typedef var_to_expand value_type;
128 typedef var_to_expand compare_type;
129 static inline hashval_t hash (const value_type *);
130 static inline bool equal (const value_type *, const compare_type *);
131 };
132
133 /* Return a hash for VES. */
134
135 inline hashval_t
136 var_expand_hasher::hash (const value_type *ves)
137 {
138 return (hashval_t) INSN_UID (ves->insn);
139 }
140
141 /* Return true if I1 and I2 refer to the same instruction. */
142
143 inline bool
144 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
145 {
146 return i1->insn == i2->insn;
147 }
148
149 /* Information about optimization applied in
150 the unrolled loop. */
151
152 struct opt_info
153 {
154 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
155 split. */
156 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
157 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
158 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
159 insns with accumulators to expand. */
160 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
161 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
162 unsigned first_new_block; /* The first basic block that was
163 duplicated. */
164 basic_block loop_exit; /* The loop exit basic block. */
165 basic_block loop_preheader; /* The loop preheader basic block. */
166 };
167
168 static void decide_unroll_stupid (struct loop *, int);
169 static void decide_unroll_constant_iterations (struct loop *, int);
170 static void decide_unroll_runtime_iterations (struct loop *, int);
171 static void unroll_loop_stupid (struct loop *);
172 static void decide_unrolling (int);
173 static void unroll_loop_constant_iterations (struct loop *);
174 static void unroll_loop_runtime_iterations (struct loop *);
175 static struct opt_info *analyze_insns_in_loop (struct loop *);
176 static void opt_info_start_duplication (struct opt_info *);
177 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
178 static void free_opt_info (struct opt_info *);
179 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
180 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
181 static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
182 static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
183 static void insert_var_expansion_initialization (struct var_to_expand *,
184 basic_block);
185 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
186 basic_block);
187 static rtx get_expansion (struct var_to_expand *);
188
189 /* Emit a message summarizing the unroll that will be
190 performed for LOOP, along with the loop's location LOCUS, if
191 appropriate given the dump or -fopt-info settings. */
192
193 static void
194 report_unroll (struct loop *loop, location_t locus)
195 {
196 int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
197
198 if (loop->lpt_decision.decision == LPT_NONE)
199 return;
200
201 if (!dump_enabled_p ())
202 return;
203
204 dump_printf_loc (report_flags, locus,
205 "loop unrolled %d times",
206 loop->lpt_decision.times);
207 if (profile_info)
208 dump_printf (report_flags,
209 " (header execution count %d)",
210 (int)loop->header->count);
211
212 dump_printf (report_flags, "\n");
213 }
214
215 /* Decide whether to unroll loops and how much. */
216 static void
217 decide_unrolling (int flags)
218 {
219 struct loop *loop;
220
221 /* Scan the loops, inner ones first. */
222 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
223 {
224 loop->lpt_decision.decision = LPT_NONE;
225 location_t locus = get_loop_location (loop);
226
227 if (dump_enabled_p ())
228 dump_printf_loc (TDF_RTL, locus,
229 ";; *** Considering loop %d at BB %d for "
230 "unrolling ***\n",
231 loop->num, loop->header->index);
232
233 /* Do not unroll cold areas. */
234 if (optimize_loop_for_size_p (loop))
235 {
236 if (dump_file)
237 fprintf (dump_file, ";; Not considering loop, cold area\n");
238 continue;
239 }
240
241 /* Can the loop be manipulated? */
242 if (!can_duplicate_loop_p (loop))
243 {
244 if (dump_file)
245 fprintf (dump_file,
246 ";; Not considering loop, cannot duplicate\n");
247 continue;
248 }
249
250 /* Skip non-innermost loops. */
251 if (loop->inner)
252 {
253 if (dump_file)
254 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
255 continue;
256 }
257
258 loop->ninsns = num_loop_insns (loop);
259 loop->av_ninsns = average_num_loop_insns (loop);
260
261 /* Try transformations one by one in decreasing order of
262 priority. */
263
264 decide_unroll_constant_iterations (loop, flags);
265 if (loop->lpt_decision.decision == LPT_NONE)
266 decide_unroll_runtime_iterations (loop, flags);
267 if (loop->lpt_decision.decision == LPT_NONE)
268 decide_unroll_stupid (loop, flags);
269
270 report_unroll (loop, locus);
271 }
272 }
273
274 /* Unroll LOOPS. */
275 void
276 unroll_loops (int flags)
277 {
278 struct loop *loop;
279 bool changed = false;
280
281 /* Now decide how much to unroll each loop. */
282 decide_unrolling (flags);
283
284 /* Scan the loops, inner ones first. */
285 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
286 {
287 /* And perform the appropriate transformations. */
288 switch (loop->lpt_decision.decision)
289 {
290 case LPT_UNROLL_CONSTANT:
291 unroll_loop_constant_iterations (loop);
292 changed = true;
293 break;
294 case LPT_UNROLL_RUNTIME:
295 unroll_loop_runtime_iterations (loop);
296 changed = true;
297 break;
298 case LPT_UNROLL_STUPID:
299 unroll_loop_stupid (loop);
300 changed = true;
301 break;
302 case LPT_NONE:
303 break;
304 default:
305 gcc_unreachable ();
306 }
307 }
308
309 if (changed)
310 {
311 calculate_dominance_info (CDI_DOMINATORS);
312 fix_loop_structure (NULL);
313 }
314
315 iv_analysis_done ();
316 }
317
318 /* Check whether the exit of LOOP is at the end of the loop body. */
319
320 static bool
321 loop_exit_at_end_p (struct loop *loop)
322 {
323 struct niter_desc *desc = get_simple_loop_desc (loop);
324 rtx_insn *insn;
325
326 /* We should never have a conditional in the latch block. */
327 gcc_assert (desc->in_edge->dest != loop->header);
328
329 if (desc->in_edge->dest != loop->latch)
330 return false;
331
332 /* Check that the latch is empty. */
333 FOR_BB_INSNS (loop->latch, insn)
334 {
335 if (INSN_P (insn) && active_insn_p (insn))
336 return false;
337 }
338
339 return true;
340 }
341
342 /* Decide whether to unroll LOOP iterating a constant number of times
343 and how much. */
344
345 static void
346 decide_unroll_constant_iterations (struct loop *loop, int flags)
347 {
348 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
349 struct niter_desc *desc;
350 widest_int iterations;
351
352 if (!(flags & UAP_UNROLL))
353 {
354 /* We were not asked to, so just return silently. */
355 return;
356 }
357
358 if (dump_file)
359 fprintf (dump_file,
360 "\n;; Considering unrolling loop with constant "
361 "number of iterations\n");
362
363 /* nunroll = total number of copies of the original loop body in
364 the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once). */
365 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
366 nunroll_by_av
367 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
368 if (nunroll > nunroll_by_av)
369 nunroll = nunroll_by_av;
370 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
371 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
372
373 if (targetm.loop_unroll_adjust)
374 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
375
376 /* Skip big loops. */
377 if (nunroll <= 1)
378 {
379 if (dump_file)
380 fprintf (dump_file, ";; Not considering loop, is too big\n");
381 return;
382 }
383
384 /* Check for simple loops. */
385 desc = get_simple_loop_desc (loop);
386
387 /* Check number of iterations. */
388 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
389 {
390 if (dump_file)
391 fprintf (dump_file,
392 ";; Unable to prove that the loop iterates constant times\n");
393 return;
394 }
395
396 /* Check whether the loop rolls enough to consider.
397 Consult also loop bounds and profile; in case the loop has more
398 than one exit it may well loop fewer times than the determined
399 maximal number of iterations. */
400 if (desc->niter < 2 * nunroll
401 || ((get_estimated_loop_iterations (loop, &iterations)
402 || get_max_loop_iterations (loop, &iterations))
403 && wi::ltu_p (iterations, 2 * nunroll)))
404 {
405 if (dump_file)
406 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
407 return;
408 }
409
410 /* Success; now compute the number of times to unroll. We alter
411 nunroll so that as few copies of the loop body as possible are
412 necessary, while still not decreasing the number of unrollings
413 too much (at most by 1). */
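/* A worked instance of the search below: with nunroll == 4, niter == 102
   and the exit test at the beginning of the body, trying i == 3 (four
   copies) peels 102 % 4 == 2 iterations, giving n_copies == 2 + 4 == 6.  */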
414 best_copies = 2 * nunroll + 10;
415
416 i = 2 * nunroll + 2;
417 if (i - 1 >= desc->niter)
418 i = desc->niter - 2;
419
420 for (; i >= nunroll - 1; i--)
421 {
422 unsigned exit_mod = desc->niter % (i + 1);
423
424 if (!loop_exit_at_end_p (loop))
425 n_copies = exit_mod + i + 1;
426 else if (exit_mod != (unsigned) i
427 || desc->noloop_assumptions != NULL_RTX)
428 n_copies = exit_mod + i + 2;
429 else
430 n_copies = i + 1;
431
432 if (n_copies < best_copies)
433 {
434 best_copies = n_copies;
435 best_unroll = i;
436 }
437 }
438
439 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
440 loop->lpt_decision.times = best_unroll;
441 }
442
443 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
444 The transformation does this:
445
446 for (i = 0; i < 102; i++)
447 body;
448
449 ==> (LOOP->LPT_DECISION.TIMES == 3)
450
451 i = 0;
452 body; i++;
453 body; i++;
454 while (i < 102)
455 {
456 body; i++;
457 body; i++;
458 body; i++;
459 body; i++;
460 }
461 */
462 static void
463 unroll_loop_constant_iterations (struct loop *loop)
464 {
465 unsigned HOST_WIDE_INT niter;
466 unsigned exit_mod;
467 sbitmap wont_exit;
468 unsigned i;
469 edge e;
470 unsigned max_unroll = loop->lpt_decision.times;
471 struct niter_desc *desc = get_simple_loop_desc (loop);
472 bool exit_at_end = loop_exit_at_end_p (loop);
473 struct opt_info *opt_info = NULL;
474 bool ok;
475
476 niter = desc->niter;
477
478 /* Should not get here (such a loop should be peeled instead). */
479 gcc_assert (niter > max_unroll + 1);
480
481 exit_mod = niter % (max_unroll + 1);
482
483 wont_exit = sbitmap_alloc (max_unroll + 1);
484 bitmap_ones (wont_exit);
485
486 auto_vec<edge> remove_edges;
487 if (flag_split_ivs_in_unroller
488 || flag_variable_expansion_in_unroller)
489 opt_info = analyze_insns_in_loop (loop);
490
491 if (!exit_at_end)
492 {
493 /* The exit is not at the end of the loop; leave the exit test
494 in the first copy, so that loops that start with a test
495 of the exit condition have a contiguous body after unrolling. */
496
497 if (dump_file)
498 fprintf (dump_file, ";; Condition at beginning of loop.\n");
499
500 /* Peel exit_mod iterations. */
501 bitmap_clear_bit (wont_exit, 0);
502 if (desc->noloop_assumptions)
503 bitmap_clear_bit (wont_exit, 1);
504
505 if (exit_mod)
506 {
507 opt_info_start_duplication (opt_info);
508 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
509 exit_mod,
510 wont_exit, desc->out_edge,
511 &remove_edges,
512 DLTHE_FLAG_UPDATE_FREQ
513 | (opt_info && exit_mod > 1
514 ? DLTHE_RECORD_COPY_NUMBER
515 : 0));
516 gcc_assert (ok);
517
518 if (opt_info && exit_mod > 1)
519 apply_opt_in_copies (opt_info, exit_mod, false, false);
520
521 desc->noloop_assumptions = NULL_RTX;
522 desc->niter -= exit_mod;
523 loop->nb_iterations_upper_bound -= exit_mod;
524 if (loop->any_estimate
525 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
526 loop->nb_iterations_estimate -= exit_mod;
527 else
528 loop->any_estimate = false;
529 }
530
531 bitmap_set_bit (wont_exit, 1);
532 }
533 else
534 {
535 /* Leave the exit test in the last copy, for the same reason as above,
536 if the loop tests the condition at the end of the loop body. */
537
538 if (dump_file)
539 fprintf (dump_file, ";; Condition at end of loop.\n");
540
541 /* We know that niter >= max_unroll + 2; so we do not need to care
542 about the case where we would exit before reaching the loop. So just
543 peel exit_mod + 1 iterations. */
544 if (exit_mod != max_unroll
545 || desc->noloop_assumptions)
546 {
547 bitmap_clear_bit (wont_exit, 0);
548 if (desc->noloop_assumptions)
549 bitmap_clear_bit (wont_exit, 1);
550
551 opt_info_start_duplication (opt_info);
552 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
553 exit_mod + 1,
554 wont_exit, desc->out_edge,
555 &remove_edges,
556 DLTHE_FLAG_UPDATE_FREQ
557 | (opt_info && exit_mod > 0
558 ? DLTHE_RECORD_COPY_NUMBER
559 : 0));
560 gcc_assert (ok);
561
562 if (opt_info && exit_mod > 0)
563 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
564
565 desc->niter -= exit_mod + 1;
566 loop->nb_iterations_upper_bound -= exit_mod + 1;
567 if (loop->any_estimate
568 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
569 loop->nb_iterations_estimate -= exit_mod + 1;
570 else
571 loop->any_estimate = false;
572 desc->noloop_assumptions = NULL_RTX;
573
574 bitmap_set_bit (wont_exit, 0);
575 bitmap_set_bit (wont_exit, 1);
576 }
577
578 bitmap_clear_bit (wont_exit, max_unroll);
579 }
580
581 /* Now unroll the loop. */
582
583 opt_info_start_duplication (opt_info);
584 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
585 max_unroll,
586 wont_exit, desc->out_edge,
587 &remove_edges,
588 DLTHE_FLAG_UPDATE_FREQ
589 | (opt_info
590 ? DLTHE_RECORD_COPY_NUMBER
591 : 0));
592 gcc_assert (ok);
593
594 if (opt_info)
595 {
596 apply_opt_in_copies (opt_info, max_unroll, true, true);
597 free_opt_info (opt_info);
598 }
599
600 free (wont_exit);
601
602 if (exit_at_end)
603 {
604 basic_block exit_block = get_bb_copy (desc->in_edge->src);
605 /* Find the new in and out edges; they are in the last copy we made. */
606
607 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
608 {
609 desc->out_edge = EDGE_SUCC (exit_block, 0);
610 desc->in_edge = EDGE_SUCC (exit_block, 1);
611 }
612 else
613 {
614 desc->out_edge = EDGE_SUCC (exit_block, 1);
615 desc->in_edge = EDGE_SUCC (exit_block, 0);
616 }
617 }
618
619 desc->niter /= max_unroll + 1;
620 loop->nb_iterations_upper_bound
621 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
622 if (loop->any_estimate)
623 loop->nb_iterations_estimate
624 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
625 desc->niter_expr = GEN_INT (desc->niter);
626
627 /* Remove the edges. */
628 FOR_EACH_VEC_ELT (remove_edges, i, e)
629 remove_path (e);
630
631 if (dump_file)
632 fprintf (dump_file,
633 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
634 max_unroll, num_loop_insns (loop));
635 }
636
637 /* Decide whether to unroll LOOP iterating a runtime-computable number
638 of times and how much. */
639 static void
640 decide_unroll_runtime_iterations (struct loop *loop, int flags)
641 {
642 unsigned nunroll, nunroll_by_av, i;
643 struct niter_desc *desc;
644 widest_int iterations;
645
646 if (!(flags & UAP_UNROLL))
647 {
648 /* We were not asked to, so just return silently. */
649 return;
650 }
651
652 if (dump_file)
653 fprintf (dump_file,
654 "\n;; Considering unrolling loop with runtime "
655 "computable number of iterations\n");
656
657 /* nunroll = total number of copies of the original loop body in
658 the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once). */
659 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
660 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
661 if (nunroll > nunroll_by_av)
662 nunroll = nunroll_by_av;
663 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
664 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
665
666 if (targetm.loop_unroll_adjust)
667 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
668
669 /* Skip big loops. */
670 if (nunroll <= 1)
671 {
672 if (dump_file)
673 fprintf (dump_file, ";; Not considering loop, is too big\n");
674 return;
675 }
676
677 /* Check for simple loops. */
678 desc = get_simple_loop_desc (loop);
679
680 /* Check simpleness. */
681 if (!desc->simple_p || desc->assumptions)
682 {
683 if (dump_file)
684 fprintf (dump_file,
685 ";; Unable to prove that the number of iterations "
686 "can be counted in runtime\n");
687 return;
688 }
689
690 if (desc->const_iter)
691 {
692 if (dump_file)
693 fprintf (dump_file, ";; Loop iterates constant times\n");
694 return;
695 }
696
697 /* Check whether the loop rolls. */
698 if ((get_estimated_loop_iterations (loop, &iterations)
699 || get_max_loop_iterations (loop, &iterations))
700 && wi::ltu_p (iterations, 2 * nunroll))
701 {
702 if (dump_file)
703 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
704 return;
705 }
706
707 /* Success; now force nunroll to be a power of 2, as we are unable to
708 cope with overflows in the computation of the number of iterations. */
709 for (i = 1; 2 * i <= nunroll; i *= 2)
710 continue;
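/* For instance, nunroll == 11 leaves the loop above with i == 8, so the
   decision below duplicates the body 7 more times (8 copies in total).  */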
711
712 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
713 loop->lpt_decision.times = i - 1;
714 }
715
716 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
717 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
718 and NULL is returned instead. */
719
720 basic_block
721 split_edge_and_insert (edge e, rtx_insn *insns)
722 {
723 basic_block bb;
724
725 if (!insns)
726 return NULL;
727 bb = split_edge (e);
728 emit_insn_after (insns, BB_END (bb));
729
730 /* ??? We used to assume that INSNS can contain control flow insns, and
731 that we had to try to find sub basic blocks in BB to maintain a valid
732 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
733 and call break_superblocks when going out of cfglayout mode. But it
734 turns out that this never happens; and that if it does ever happen,
735 the verify_flow_info at the end of the RTL loop passes would fail.
736
737 There are two reasons why we expected we could have control flow insns
738 in INSNS. The first is when a comparison has to be done in parts, and
739 the second is when the number of iterations is computed for loops with
740 the number of iterations known at runtime. In both cases, test cases
741 to get control flow in INSNS appear to be impossible to construct:
742
743 * If do_compare_rtx_and_jump needs several branches to do comparison
744 in a mode that needs comparison by parts, we cannot analyze the
745 number of iterations of the loop, and we never get to unrolling it.
746
747 * The code in expand_divmod that was suspected to cause creation of
748 branching code seems to be only accessed for signed division. The
749 divisions used by # of iterations analysis are always unsigned.
750 Problems might arise on architectures that emit branching code
751 for some operations that may appear in the unroller (especially
752 for division), but we have no such architectures.
753
754 Considering all this, it was decided that we should for now assume
755 that INSNS can in theory contain control flow insns, but in practice
756 it never does. So we don't handle the theoretical case, and should
757 a real failure ever show up, we have a pretty good clue for how to
758 fix it. */
759
760 return bb;
761 }
762
763 /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
764 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
765 in order to create a jump. */
766
767 static rtx_insn *
768 compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob,
769 rtx_insn *cinsn)
770 {
771 rtx_insn *seq, *jump;
772 rtx cond;
773 enum machine_mode mode;
774
775 mode = GET_MODE (op0);
776 if (mode == VOIDmode)
777 mode = GET_MODE (op1);
778
779 start_sequence ();
780 if (GET_MODE_CLASS (mode) == MODE_CC)
781 {
782 /* A hack -- there seems to be no easy generic way to make a
783 conditional jump from a ccmode comparison. */
784 gcc_assert (cinsn);
785 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
786 gcc_assert (GET_CODE (cond) == comp);
787 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
788 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
789 emit_jump_insn (copy_insn (PATTERN (cinsn)));
790 jump = get_last_insn ();
791 gcc_assert (JUMP_P (jump));
792 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
793 LABEL_NUSES (JUMP_LABEL (jump))++;
794 redirect_jump (jump, label, 0);
795 }
796 else
797 {
798 gcc_assert (!cinsn);
799
800 op0 = force_operand (op0, NULL_RTX);
801 op1 = force_operand (op1, NULL_RTX);
802 do_compare_rtx_and_jump (op0, op1, comp, 0,
803 mode, NULL_RTX, NULL_RTX, label, -1);
804 jump = get_last_insn ();
805 gcc_assert (JUMP_P (jump));
806 JUMP_LABEL (jump) = label;
807 LABEL_NUSES (label)++;
808 }
809 add_int_reg_note (jump, REG_BR_PROB, prob);
810
811 seq = get_insns ();
812 end_sequence ();
813
814 return seq;
815 }
816
817 /* Unroll LOOP, for which we are able to count the number of iterations
818 at runtime, LOOP->LPT_DECISION.TIMES times. The transformation does this
819 (with some extra care for the case n < 0):
820
821 for (i = 0; i < n; i++)
822 body;
823
824 ==> (LOOP->LPT_DECISION.TIMES == 3)
825
826 i = 0;
827 mod = n % 4;
828
829 switch (mod)
830 {
831 case 3:
832 body; i++;
833 case 2:
834 body; i++;
835 case 1:
836 body; i++;
837 case 0: ;
838 }
839
840 while (i < n)
841 {
842 body; i++;
843 body; i++;
844 body; i++;
845 body; i++;
846 }
847 */
848 static void
849 unroll_loop_runtime_iterations (struct loop *loop)
850 {
851 rtx old_niter, niter, tmp;
852 rtx_insn *init_code, *branch_code;
853 unsigned i, j, p;
854 basic_block preheader, *body, swtch, ezc_swtch;
855 sbitmap wont_exit;
856 int may_exit_copy;
857 unsigned n_peel;
858 edge e;
859 bool extra_zero_check, last_may_exit;
860 unsigned max_unroll = loop->lpt_decision.times;
861 struct niter_desc *desc = get_simple_loop_desc (loop);
862 bool exit_at_end = loop_exit_at_end_p (loop);
863 struct opt_info *opt_info = NULL;
864 bool ok;
865
866 if (flag_split_ivs_in_unroller
867 || flag_variable_expansion_in_unroller)
868 opt_info = analyze_insns_in_loop (loop);
869
870 /* Remember blocks whose dominators will have to be updated. */
871 auto_vec<basic_block> dom_bbs;
872
873 body = get_loop_body (loop);
874 for (i = 0; i < loop->num_nodes; i++)
875 {
876 vec<basic_block> ldom;
877 basic_block bb;
878
879 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
880 FOR_EACH_VEC_ELT (ldom, j, bb)
881 if (!flow_bb_inside_loop_p (loop, bb))
882 dom_bbs.safe_push (bb);
883
884 ldom.release ();
885 }
886 free (body);
887
888 if (!exit_at_end)
889 {
890 /* Leave exit in first copy (for explanation why see comment in
891 unroll_loop_constant_iterations). */
892 may_exit_copy = 0;
893 n_peel = max_unroll - 1;
894 extra_zero_check = true;
895 last_may_exit = false;
896 }
897 else
898 {
899 /* Leave exit in last copy (for explanation why see comment in
900 unroll_loop_constant_iterations). */
901 may_exit_copy = max_unroll;
902 n_peel = max_unroll;
903 extra_zero_check = false;
904 last_may_exit = true;
905 }
906
907 /* Get expression for number of iterations. */
908 start_sequence ();
909 old_niter = niter = gen_reg_rtx (desc->mode);
910 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
911 if (tmp != niter)
912 emit_move_insn (niter, tmp);
913
914 /* Count modulo by ANDing it with max_unroll; we use the fact that
915 the number of unrollings is a power of two, and thus this is correct
916 even if there is overflow in the computation. */
917 niter = expand_simple_binop (desc->mode, AND,
918 niter, gen_int_mode (max_unroll, desc->mode),
919 NULL_RTX, 0, OPTAB_LIB_WIDEN);
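/* E.g., with max_unroll == 3 this computes niter & 3, which equals
   niter % 4 for any unsigned value, even when the computation of niter
   itself wrapped around.  */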
920
921 init_code = get_insns ();
922 end_sequence ();
923 unshare_all_rtl_in_chain (init_code);
924
925 /* Precondition the loop. */
926 split_edge_and_insert (loop_preheader_edge (loop), init_code);
927
928 auto_vec<edge> remove_edges;
929
930 wont_exit = sbitmap_alloc (max_unroll + 2);
931
932 /* Peel the first copy of the loop body (almost always we must leave the
933 exit test here; the only exception is when we have an extra zero check
934 and the number of iterations is reliable). Also record the place of the
935 (possible) extra zero check. */
936 bitmap_clear (wont_exit);
937 if (extra_zero_check
938 && !desc->noloop_assumptions)
939 bitmap_set_bit (wont_exit, 1);
940 ezc_swtch = loop_preheader_edge (loop)->src;
941 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
942 1, wont_exit, desc->out_edge,
943 &remove_edges,
944 DLTHE_FLAG_UPDATE_FREQ);
945 gcc_assert (ok);
946
947 /* Record the place where switch will be built for preconditioning. */
948 swtch = split_edge (loop_preheader_edge (loop));
949
950 for (i = 0; i < n_peel; i++)
951 {
952 /* Peel the copy. */
953 bitmap_clear (wont_exit);
954 if (i != n_peel - 1 || !last_may_exit)
955 bitmap_set_bit (wont_exit, 1);
956 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
957 1, wont_exit, desc->out_edge,
958 &remove_edges,
959 DLTHE_FLAG_UPDATE_FREQ);
960 gcc_assert (ok);
961
962 /* Create item for switch. */
963 j = n_peel - i - (extra_zero_check ? 0 : 1);
964 p = REG_BR_PROB_BASE / (i + 2);
965
966 preheader = split_edge (loop_preheader_edge (loop));
967 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
968 block_label (preheader), p,
969 NULL);
970
971 /* We rely on the fact that the compare and jump cannot be optimized out,
972 and hence the cfg we create is correct. */
973 gcc_assert (branch_code != NULL_RTX);
974
975 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
976 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
977 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
978 e = make_edge (swtch, preheader,
979 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
980 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
981 e->probability = p;
982 }
983
984 if (extra_zero_check)
985 {
986 /* Add branch for zero iterations. */
987 p = REG_BR_PROB_BASE / (max_unroll + 1);
988 swtch = ezc_swtch;
989 preheader = split_edge (loop_preheader_edge (loop));
990 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
991 block_label (preheader), p,
992 NULL);
993 gcc_assert (branch_code != NULL_RTX);
994
995 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
996 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
997 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
998 e = make_edge (swtch, preheader,
999 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1000 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1001 e->probability = p;
1002 }
1003
1004 /* Recount dominators for outer blocks. */
1005 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1006
1007 /* And unroll loop. */
1008
1009 bitmap_ones (wont_exit);
1010 bitmap_clear_bit (wont_exit, may_exit_copy);
1011 opt_info_start_duplication (opt_info);
1012
1013 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1014 max_unroll,
1015 wont_exit, desc->out_edge,
1016 &remove_edges,
1017 DLTHE_FLAG_UPDATE_FREQ
1018 | (opt_info
1019 ? DLTHE_RECORD_COPY_NUMBER
1020 : 0));
1021 gcc_assert (ok);
1022
1023 if (opt_info)
1024 {
1025 apply_opt_in_copies (opt_info, max_unroll, true, true);
1026 free_opt_info (opt_info);
1027 }
1028
1029 free (wont_exit);
1030
1031 if (exit_at_end)
1032 {
1033 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1034 /* Find the new in and out edges; they are in the last copy we
1035 made. */
1036
1037 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1038 {
1039 desc->out_edge = EDGE_SUCC (exit_block, 0);
1040 desc->in_edge = EDGE_SUCC (exit_block, 1);
1041 }
1042 else
1043 {
1044 desc->out_edge = EDGE_SUCC (exit_block, 1);
1045 desc->in_edge = EDGE_SUCC (exit_block, 0);
1046 }
1047 }
1048
1049 /* Remove the edges. */
1050 FOR_EACH_VEC_ELT (remove_edges, i, e)
1051 remove_path (e);
1052
1053 /* We must be careful when updating the number of iterations due to
1054 preconditioning and the fact that the value must be valid at entry
1055 of the loop. After passing through the above code, we see that
1056 the correct new number of iterations is this: */
1057 gcc_assert (!desc->const_iter);
1058 desc->niter_expr =
1059 simplify_gen_binary (UDIV, desc->mode, old_niter,
1060 gen_int_mode (max_unroll + 1, desc->mode));
1061 loop->nb_iterations_upper_bound
1062 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
1063 if (loop->any_estimate)
1064 loop->nb_iterations_estimate
1065 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
1066 if (exit_at_end)
1067 {
1068 desc->niter_expr =
1069 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1070 desc->noloop_assumptions = NULL_RTX;
1071 --loop->nb_iterations_upper_bound;
1072 if (loop->any_estimate
1073 && loop->nb_iterations_estimate != 0)
1074 --loop->nb_iterations_estimate;
1075 else
1076 loop->any_estimate = false;
1077 }
1078
1079 if (dump_file)
1080 fprintf (dump_file,
1081 ";; Unrolled loop %d times, counting # of iterations "
1082 "in runtime, %i insns\n",
1083 max_unroll, num_loop_insns (loop));
1084 }
1085
1086 /* Decide whether to unroll LOOP stupidly and how much. */
1087 static void
1088 decide_unroll_stupid (struct loop *loop, int flags)
1089 {
1090 unsigned nunroll, nunroll_by_av, i;
1091 struct niter_desc *desc;
1092 widest_int iterations;
1093
1094 if (!(flags & UAP_UNROLL_ALL))
1095 {
1096 /* We were not asked to, so just return silently. */
1097 return;
1098 }
1099
1100 if (dump_file)
1101 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1102
1103 /* nunroll = total number of copies of the original loop body in
1104 the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once). */
1105 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1106 nunroll_by_av
1107 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1108 if (nunroll > nunroll_by_av)
1109 nunroll = nunroll_by_av;
1110 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1111 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1112
1113 if (targetm.loop_unroll_adjust)
1114 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1115
1116 /* Skip big loops. */
1117 if (nunroll <= 1)
1118 {
1119 if (dump_file)
1120 fprintf (dump_file, ";; Not considering loop, is too big\n");
1121 return;
1122 }
1123
1124 /* Check for simple loops. */
1125 desc = get_simple_loop_desc (loop);
1126
1127 /* Check simpleness. */
1128 if (desc->simple_p && !desc->assumptions)
1129 {
1130 if (dump_file)
1131 fprintf (dump_file, ";; The loop is simple\n");
1132 return;
1133 }
1134
1135 /* Do not unroll loops with branches inside -- it increases the number
1136 of mispredicts.
1137 TODO: this heuristic needs tuning; a call inside the loop body
1138 is also a relatively good reason not to unroll. */
1139 if (num_loop_branches (loop) > 1)
1140 {
1141 if (dump_file)
1142 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1143 return;
1144 }
1145
1146 /* Check whether the loop rolls. */
1147 if ((get_estimated_loop_iterations (loop, &iterations)
1148 || get_max_loop_iterations (loop, &iterations))
1149 && wi::ltu_p (iterations, 2 * nunroll))
1150 {
1151 if (dump_file)
1152 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1153 return;
1154 }
1155
1156 /* Success. Now force nunroll to be a power of 2, as it seems that this
1157 improves results (partially because of better alignments, partially
1158 because of some dark magic). */
1159 for (i = 1; 2 * i <= nunroll; i *= 2)
1160 continue;
1161
1162 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1163 loop->lpt_decision.times = i - 1;
1164 }
1165
1166 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1167
1168 while (cond)
1169 body;
1170
1171 ==> (LOOP->LPT_DECISION.TIMES == 3)
1172
1173 while (cond)
1174 {
1175 body;
1176 if (!cond) break;
1177 body;
1178 if (!cond) break;
1179 body;
1180 if (!cond) break;
1181 body;
1182 }
1183 */
1184 static void
1185 unroll_loop_stupid (struct loop *loop)
1186 {
1187 sbitmap wont_exit;
1188 unsigned nunroll = loop->lpt_decision.times;
1189 struct niter_desc *desc = get_simple_loop_desc (loop);
1190 struct opt_info *opt_info = NULL;
1191 bool ok;
1192
1193 if (flag_split_ivs_in_unroller
1194 || flag_variable_expansion_in_unroller)
1195 opt_info = analyze_insns_in_loop (loop);
1196
1197
1198 wont_exit = sbitmap_alloc (nunroll + 1);
1199 bitmap_clear (wont_exit);
1200 opt_info_start_duplication (opt_info);
1201
1202 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1203 nunroll, wont_exit,
1204 NULL, NULL,
1205 DLTHE_FLAG_UPDATE_FREQ
1206 | (opt_info
1207 ? DLTHE_RECORD_COPY_NUMBER
1208 : 0));
1209 gcc_assert (ok);
1210
1211 if (opt_info)
1212 {
1213 apply_opt_in_copies (opt_info, nunroll, true, true);
1214 free_opt_info (opt_info);
1215 }
1216
1217 free (wont_exit);
1218
1219 if (desc->simple_p)
1220 {
1221 /* We may indeed get here, provided that there are nontrivial assumptions
1222 for the loop to be really simple. We could update the counts, but the
1223 problem is that we are unable to decide which exit will be taken
1224 (not really true in case the number of iterations is constant,
1225 but no one will do anything with this information, so we do not
1226 worry about it). */
1227 desc->simple_p = false;
1228 }
1229
1230 if (dump_file)
1231 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1232 nunroll, num_loop_insns (loop));
1233 }
1234
1235 /* Return true if REG is referenced in exactly one nondebug insn in LOOP.
1236 Set *DEBUG_USES to the number of debug insns that reference the
1237 variable. */
1238
1239 static bool
1240 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1241 int *debug_uses)
1242 {
1243 basic_block *body, bb;
1244 unsigned i;
1245 int count_ref = 0;
1246 rtx_insn *insn;
1247
1248 body = get_loop_body (loop);
1249 for (i = 0; i < loop->num_nodes; i++)
1250 {
1251 bb = body[i];
1252
1253 FOR_BB_INSNS (bb, insn)
1254 if (!rtx_referenced_p (reg, insn))
1255 continue;
1256 else if (DEBUG_INSN_P (insn))
1257 ++*debug_uses;
1258 else if (++count_ref > 1)
1259 break;
1260 }
1261 free (body);
1262 return (count_ref == 1);
1263 }
1264
1265 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1266
1267 static void
1268 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1269 {
1270 basic_block *body, bb;
1271 unsigned i;
1272 rtx_insn *insn;
1273
1274 body = get_loop_body (loop);
1275 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1276 {
1277 bb = body[i];
1278
1279 FOR_BB_INSNS (bb, insn)
1280 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1281 continue;
1282 else
1283 {
1284 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1285 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1286 if (!--debug_uses)
1287 break;
1288 }
1289 }
1290 free (body);
1291 }
1292
1293 /* Determine whether INSN contains an accumulator
1294 which can be expanded into separate copies,
1295 one for each copy of the LOOP body.
1296
1297 for (i = 0 ; i < n; i++)
1298 sum += a[i];
1299
1300 ==>
1301
1302 sum += a[i]
1303 ....
1304 i = i+1;
1305 sum1 += a[i]
1306 ....
1307 i = i+1
1308 sum2 += a[i];
1309 ....
1310
1311 Return NULL if INSN contains no opportunity for expansion of an
1312 accumulator. Otherwise, allocate a VAR_TO_EXPAND structure, fill it with
1313 the relevant information and return a pointer to it.
1314 */
1315
1316 static struct var_to_expand *
1317 analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
1318 {
1319 rtx set, dest, src;
1320 struct var_to_expand *ves;
1321 unsigned accum_pos;
1322 enum rtx_code code;
1323 int debug_uses = 0;
1324
1325 set = single_set (insn);
1326 if (!set)
1327 return NULL;
1328
1329 dest = SET_DEST (set);
1330 src = SET_SRC (set);
1331 code = GET_CODE (src);
1332
1333 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1334 return NULL;
1335
1336 if (FLOAT_MODE_P (GET_MODE (dest)))
1337 {
1338 if (!flag_associative_math)
1339 return NULL;
1340 /* In the case of FMA, we're also changing the rounding. */
1341 if (code == FMA && !flag_unsafe_math_optimizations)
1342 return NULL;
1343 }
1344
1345 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1346 in MD. But if there is no optab to generate the insn, we cannot
1347 perform the variable expansion. This can happen if an MD provides
1348 an insn but not a named pattern to generate it, for example to avoid
1349 producing code that needs additional mode switches like for x87/mmx.
1350
1351 So we check have_insn_for which looks for an optab for the operation
1352 in SRC. If it doesn't exist, we can't perform the expansion even
1353 though INSN is valid. */
1354 if (!have_insn_for (code, GET_MODE (src)))
1355 return NULL;
1356
1357 if (!REG_P (dest)
1358 && !(GET_CODE (dest) == SUBREG
1359 && REG_P (SUBREG_REG (dest))))
1360 return NULL;
1361
1362 /* Find the accumulator use within the operation. */
1363 if (code == FMA)
1364 {
1365 /* We only support accumulation via FMA in the ADD position. */
1366 if (!rtx_equal_p (dest, XEXP (src, 2)))
1367 return NULL;
1368 accum_pos = 2;
1369 }
1370 else if (rtx_equal_p (dest, XEXP (src, 0)))
1371 accum_pos = 0;
1372 else if (rtx_equal_p (dest, XEXP (src, 1)))
1373 {
1374 /* The method of expansion that we are using, which initializes the
1375 expansions with zero and sums them at the end of the computation,
1376 will yield wrong results for (x = something - x); thus avoid using
1377 it in that case. */
1378 if (code == MINUS)
1379 return NULL;
1380 accum_pos = 1;
1381 }
1382 else
1383 return NULL;
1384
1385 /* It must not otherwise be used. */
1386 if (code == FMA)
1387 {
1388 if (rtx_referenced_p (dest, XEXP (src, 0))
1389 || rtx_referenced_p (dest, XEXP (src, 1)))
1390 return NULL;
1391 }
1392 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1393 return NULL;
1394
1395 /* It must be used in exactly one insn. */
1396 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1397 return NULL;
1398
1399 if (dump_file)
1400 {
1401 fprintf (dump_file, "\n;; Expanding Accumulator ");
1402 print_rtl (dump_file, dest);
1403 fprintf (dump_file, "\n");
1404 }
1405
1406 if (debug_uses)
1407 /* Instead of resetting the debug insns, we could replace each
1408 debug use in the loop with the sum or product of all expanded
1409 accumulators. Since we'll only know of all expansions at the
1410 end, we'd have to keep track of which vars_to_expand a debug
1411 insn in the loop references, take note of each copy of the
1412 debug insn during unrolling, and when it's all done, compute
1413 the sum or product of each variable and adjust the original
1414 debug insn and each copy thereof. What a pain! */
1415 reset_debug_uses_in_loop (loop, dest, debug_uses);
1416
1417 /* Record the accumulator to expand. */
1418 ves = XNEW (struct var_to_expand);
1419 ves->insn = insn;
1420 ves->reg = copy_rtx (dest);
1421 ves->var_expansions.create (1);
1422 ves->next = NULL;
1423 ves->op = GET_CODE (src);
1424 ves->expansion_count = 0;
1425 ves->reuse_expansion = 0;
1426 return ves;
1427 }
1428
1429 /* Determine whether there is an induction variable in INSN that
1430 we would like to split during unrolling.
1431
1432 I.e. replace
1433
1434 i = i + 1;
1435 ...
1436 i = i + 1;
1437 ...
1438 i = i + 1;
1439 ...
1440
1441 chains of this type by
1442
1443 i0 = i + 1
1444 ...
1445 i = i0 + 1
1446 ...
1447 i = i0 + 2
1448 ...
1449
1450 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1451 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1452 pointer to it. */
1453
1454 static struct iv_to_split *
1455 analyze_iv_to_split_insn (rtx_insn *insn)
1456 {
1457 rtx set, dest;
1458 struct rtx_iv iv;
1459 struct iv_to_split *ivts;
1460 bool ok;
1461
1462 /* For now we just split the basic induction variables. Later this may be
1463 extended for example by selecting also addresses of memory references. */
1464 set = single_set (insn);
1465 if (!set)
1466 return NULL;
1467
1468 dest = SET_DEST (set);
1469 if (!REG_P (dest))
1470 return NULL;
1471
1472 if (!biv_p (insn, dest))
1473 return NULL;
1474
1475 ok = iv_analyze_result (insn, dest, &iv);
1476
1477 /* This used to be an assert under the assumption that if biv_p returns
1478 true then iv_analyze_result must also return true. However, that
1479 assumption is not strictly correct as evidenced by pr25569.
1480
1481 Returning NULL when iv_analyze_result returns false is safe and
1482 avoids the problems in pr25569 until the iv_analyze_* routines
1483 can be fixed, which is apparently hard and time consuming
1484 according to their author. */
1485 if (! ok)
1486 return NULL;
1487
1488 if (iv.step == const0_rtx
1489 || iv.mode != iv.extend_mode)
1490 return NULL;
1491
1492 /* Record the insn to split. */
1493 ivts = XNEW (struct iv_to_split);
1494 ivts->insn = insn;
1495 ivts->orig_var = dest;
1496 ivts->base_var = NULL_RTX;
1497 ivts->step = iv.step;
1498 ivts->next = NULL;
1499
1500 return ivts;
1501 }
1502
1503 /* Determine which insns in LOOP can be optimized.
1504 Return an OPT_INFO struct with the relevant hash tables filled
1505 with all insns to be optimized. The FIRST_NEW_BLOCK field
1506 is undefined for the return value. */
1507
1508 static struct opt_info *
1509 analyze_insns_in_loop (struct loop *loop)
1510 {
1511 basic_block *body, bb;
1512 unsigned i;
1513 struct opt_info *opt_info = XCNEW (struct opt_info);
1514 rtx_insn *insn;
1515 struct iv_to_split *ivts = NULL;
1516 struct var_to_expand *ves = NULL;
1517 iv_to_split **slot1;
1518 var_to_expand **slot2;
1519 vec<edge> edges = get_loop_exit_edges (loop);
1520 edge exit;
1521 bool can_apply = false;
1522
1523 iv_analysis_loop_init (loop);
1524
1525 body = get_loop_body (loop);
1526
1527 if (flag_split_ivs_in_unroller)
1528 {
1529 opt_info->insns_to_split
1530 = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
1531 opt_info->iv_to_split_head = NULL;
1532 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1533 }
1534
1535 /* Record the loop exit bb and loop preheader before the unrolling. */
1536 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1537
1538 if (edges.length () == 1)
1539 {
1540 exit = edges[0];
1541 if (!(exit->flags & EDGE_COMPLEX))
1542 {
1543 opt_info->loop_exit = split_edge (exit);
1544 can_apply = true;
1545 }
1546 }
1547
1548 if (flag_variable_expansion_in_unroller
1549 && can_apply)
1550 {
1551 opt_info->insns_with_var_to_expand
1552 = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
1553 opt_info->var_to_expand_head = NULL;
1554 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1555 }
1556
1557 for (i = 0; i < loop->num_nodes; i++)
1558 {
1559 bb = body[i];
1560 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1561 continue;
1562
1563 FOR_BB_INSNS (bb, insn)
1564 {
1565 if (!INSN_P (insn))
1566 continue;
1567
1568 if (opt_info->insns_to_split)
1569 ivts = analyze_iv_to_split_insn (insn);
1570
1571 if (ivts)
1572 {
1573 slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
1574 gcc_assert (*slot1 == NULL);
1575 *slot1 = ivts;
1576 *opt_info->iv_to_split_tail = ivts;
1577 opt_info->iv_to_split_tail = &ivts->next;
1578 continue;
1579 }
1580
1581 if (opt_info->insns_with_var_to_expand)
1582 ves = analyze_insn_to_expand_var (loop, insn);
1583
1584 if (ves)
1585 {
1586 slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
1587 gcc_assert (*slot2 == NULL);
1588 *slot2 = ves;
1589 *opt_info->var_to_expand_tail = ves;
1590 opt_info->var_to_expand_tail = &ves->next;
1591 }
1592 }
1593 }
1594
1595 edges.release ();
1596 free (body);
1597 return opt_info;
1598 }
1599
1600 /* Called just before loop duplication. Record the start of the
1601 duplicated area in OPT_INFO. */
1602
1603 static void
1604 opt_info_start_duplication (struct opt_info *opt_info)
1605 {
1606 if (opt_info)
1607 opt_info->first_new_block = last_basic_block_for_fn (cfun);
1608 }
1609
1610 /* Determine the number of iterations between initialization of the base
1611 variable and the current copy (N_COPY). N_COPIES is the total number
1612 of newly created copies. UNROLLING is true if we are unrolling
1613 (not peeling) the loop. */
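/* For example, when unrolling with four copies the original body (copy 0)
   holds the initialization and receives delta 0, while copies 1..3 receive
   deltas 1..3; when peeling, copy 1 holds the initialization, so the
   original loop body receives the largest delta.  */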
1614
1615 static unsigned
1616 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1617 {
1618 if (unrolling)
1619 {
1620 /* If we are unrolling, initialization is done in the original loop
1621 body (number 0). */
1622 return n_copy;
1623 }
1624 else
1625 {
1626 /* If we are peeling, the copy in which the initialization occurs has
1627 number 1. The original loop (number 0) is the last. */
1628 if (n_copy)
1629 return n_copy - 1;
1630 else
1631 return n_copies;
1632 }
1633 }
1634
1635 /* Allocate basic variable for the induction variable chain. */
1636
1637 static void
1638 allocate_basic_variable (struct iv_to_split *ivts)
1639 {
1640 rtx expr = SET_SRC (single_set (ivts->insn));
1641
1642 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1643 }
1644
1645 /* Insert initialization of basic variable of IVTS before INSN, taking
1646 the initial value from INSN. */
1647
1648 static void
1649 insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
1650 {
1651 rtx expr = copy_rtx (SET_SRC (single_set (insn)));
1652 rtx_insn *seq;
1653
1654 start_sequence ();
1655 expr = force_operand (expr, ivts->base_var);
1656 if (expr != ivts->base_var)
1657 emit_move_insn (ivts->base_var, expr);
1658 seq = get_insns ();
1659 end_sequence ();
1660
1661 emit_insn_before (seq, insn);
1662 }
1663
1664 /* Replace the use of the induction variable described in IVTS in INSN
1665 with base variable + DELTA * step. */
1666
1667 static void
1668 split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
1669 {
1670 rtx expr, *loc, incr, var;
1671 rtx_insn *seq;
1672 enum machine_mode mode = GET_MODE (ivts->base_var);
1673 rtx src, dest, set;
1674
1675 /* Construct base + DELTA * step. */
1676 if (!delta)
1677 expr = ivts->base_var;
1678 else
1679 {
1680 incr = simplify_gen_binary (MULT, mode,
1681 ivts->step, gen_int_mode (delta, mode));
1682 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1683 ivts->base_var, incr);
1684 }
1685
1686 /* Figure out where to do the replacement. */
1687 loc = &SET_SRC (single_set (insn));
1688
1689 /* If we can make the replacement right away, we're done. */
1690 if (validate_change (insn, loc, expr, 0))
1691 return;
1692
1693 /* Otherwise, force EXPR into a register and try again. */
1694 start_sequence ();
1695 var = gen_reg_rtx (mode);
1696 expr = force_operand (expr, var);
1697 if (expr != var)
1698 emit_move_insn (var, expr);
1699 seq = get_insns ();
1700 end_sequence ();
1701 emit_insn_before (seq, insn);
1702
1703 if (validate_change (insn, loc, var, 0))
1704 return;
1705
1706 /* The last chance. Try recreating the assignment in insn
1707 completely from scratch. */
1708 set = single_set (insn);
1709 gcc_assert (set);
1710
1711 start_sequence ();
1712 *loc = var;
1713 src = copy_rtx (SET_SRC (set));
1714 dest = copy_rtx (SET_DEST (set));
1715 src = force_operand (src, dest);
1716 if (src != dest)
1717 emit_move_insn (dest, src);
1718 seq = get_insns ();
1719 end_sequence ();
1720
1721 emit_insn_before (seq, insn);
1722 delete_insn (insn);
1723 }
1724
1725
1726 /* Return one expansion of the accumulator recorded in struct VE. */
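/* With two recorded expansions, for example, successive calls return
   ve->reg, var_expansions[0], var_expansions[1], ve->reg, and so on.  */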
1727
1728 static rtx
1729 get_expansion (struct var_to_expand *ve)
1730 {
1731 rtx reg;
1732
1733 if (ve->reuse_expansion == 0)
1734 reg = ve->reg;
1735 else
1736 reg = ve->var_expansions[ve->reuse_expansion - 1];
1737
1738 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
1739 ve->reuse_expansion = 0;
1740 else
1741 ve->reuse_expansion++;
1742
1743 return reg;
1744 }
1745
1746
1747 /* Given INSN replace the uses of the accumulator recorded in VE
1748 with a new register. */
1749
1750 static void
1751 expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
1752 {
1753 rtx new_reg, set;
1754 bool really_new_expansion = false;
1755
1756 set = single_set (insn);
1757 gcc_assert (set);
1758
1759 /* Generate a new register only if the expansion limit has not been
1760 reached. Else reuse an already existing expansion. */
1761 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
1762 {
1763 really_new_expansion = true;
1764 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
1765 }
1766 else
1767 new_reg = get_expansion (ve);
1768
1769 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
1770 if (apply_change_group ())
1771 if (really_new_expansion)
1772 {
1773 ve->var_expansions.safe_push (new_reg);
1774 ve->expansion_count++;
1775 }
1776 }
1777
1778 /* Initialize the variable expansions in the loop preheader. PLACE is the
1779 loop-preheader basic block where the initialization of the
1780 expansions should take place. The expansions are initialized with
1781 (-0) when the operation is plus or minus to honor signed zero. This
1782 way we prevent cases where the sign of the final result is
1783 affected by the sign of the expansion. Here is an example to
1784 demonstrate this:
1785
1786 for (i = 0 ; i < n; i++)
1787 sum += something;
1788
1789 ==>
1790
1791 sum += something
1792 ....
1793 i = i+1;
1794 sum1 += something
1795 ....
1796 i = i+1
1797 sum2 += something;
1798 ....
1799
1800 When SUM is initialized with -zero and SOMETHING is also -zero, the
1801 final result of sum should be -zero; thus the expansions sum1 and sum2
1802 should be initialized with -zero as well (otherwise we will get +zero
1803 as the final result). */
1804
1805 static void
1806 insert_var_expansion_initialization (struct var_to_expand *ve,
1807 basic_block place)
1808 {
1809 rtx_insn *seq;
1810 rtx var, zero_init;
1811 unsigned i;
1812 enum machine_mode mode = GET_MODE (ve->reg);
1813 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
1814
1815 if (ve->var_expansions.length () == 0)
1816 return;
1817
1818 start_sequence ();
1819 switch (ve->op)
1820 {
1821 case FMA:
1822 /* Note that we only accumulate FMA via the ADD operand. */
1823 case PLUS:
1824 case MINUS:
1825 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1826 {
1827 if (honor_signed_zero_p)
1828 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
1829 else
1830 zero_init = CONST0_RTX (mode);
1831 emit_move_insn (var, zero_init);
1832 }
1833 break;
1834
1835 case MULT:
1836 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1837 {
1838 zero_init = CONST1_RTX (GET_MODE (var));
1839 emit_move_insn (var, zero_init);
1840 }
1841 break;
1842
1843 default:
1844 gcc_unreachable ();
1845 }
1846
1847 seq = get_insns ();
1848 end_sequence ();
1849
1850 emit_insn_after (seq, BB_END (place));
1851 }
1852
1853 /* Combine the variable expansions at the loop exit. PLACE is the
1854 loop exit basic block where the summation of the expansions should
1855 take place. */
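/* For a PLUS or MINUS accumulator with two expansions, for instance, this
   emits ve->reg = var_expansions[1] + (var_expansions[0] + ve->reg) right
   after the basic-block note of PLACE.  */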
1856
1857 static void
1858 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
1859 {
1860 rtx sum = ve->reg;
1861 rtx expr, var;
1862 rtx_insn *seq, *insn;
1863 unsigned i;
1864
1865 if (ve->var_expansions.length () == 0)
1866 return;
1867
1868 start_sequence ();
1869 switch (ve->op)
1870 {
1871 case FMA:
1872 /* Note that we only accumulate FMA via the ADD operand. */
1873 case PLUS:
1874 case MINUS:
1875 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1876 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
1877 break;
1878
1879 case MULT:
1880 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1881 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
1882 break;
1883
1884 default:
1885 gcc_unreachable ();
1886 }
1887
1888 expr = force_operand (sum, ve->reg);
1889 if (expr != ve->reg)
1890 emit_move_insn (ve->reg, expr);
1891 seq = get_insns ();
1892 end_sequence ();
1893
1894 insn = BB_HEAD (place);
1895 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
1896 insn = NEXT_INSN (insn);
1897
1898 emit_insn_after (seq, insn);
1899 }
1900
1901 /* Strip away REG_EQUAL notes for IVs we're splitting.
1902
1903 Updating REG_EQUAL notes for IVs we split is tricky: We
1904 cannot tell until after unrolling, DF-rescanning, and liveness
1905 updating, whether an EQ_USE is reached by the split IV while
1906 the IV reg is still live. See PR55006.
1907
1908 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
1909 because RTL loop-iv requires us to defer rescanning insns and
1910 any notes attached to them. So resort to old techniques... */
1911
1912 static void
1913 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
1914 {
1915 struct iv_to_split *ivts;
1916 rtx note = find_reg_equal_equiv_note (insn);
1917 if (! note)
1918 return;
1919 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1920 if (reg_mentioned_p (ivts->orig_var, note))
1921 {
1922 remove_note (insn, note);
1923 return;
1924 }
1925 }
1926
1927 /* Apply loop optimizations in loop copies using the
1928 data gathered during the unrolling. Structure
1929 OPT_INFO records that data.
1930 
1931 UNROLLING is true if we unrolled (not peeled) the loop.
1932 REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
1933 of the loop (as should happen in complete unrolling, but not in ordinary
1934 peeling of the loop). */
1935
1936 static void
1937 apply_opt_in_copies (struct opt_info *opt_info,
1938 unsigned n_copies, bool unrolling,
1939 bool rewrite_original_loop)
1940 {
1941 unsigned i, delta;
1942 basic_block bb, orig_bb;
1943 rtx_insn *insn, *orig_insn, *next;
1944 struct iv_to_split ivts_templ, *ivts;
1945 struct var_to_expand ve_templ, *ves;
1946
1947 /* Sanity check -- we need to put initialization in the original loop
1948 body. */
1949 gcc_assert (!unrolling || rewrite_original_loop);
1950
1951 /* Allocate the basic variables (i0). */
1952 if (opt_info->insns_to_split)
1953 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1954 allocate_basic_variable (ivts);
1955
1956 for (i = opt_info->first_new_block;
1957 i < (unsigned) last_basic_block_for_fn (cfun);
1958 i++)
1959 {
1960 bb = BASIC_BLOCK_FOR_FN (cfun, i);
1961 orig_bb = get_bb_original (bb);
1962
1963 /* bb->aux holds position in copy sequence initialized by
1964 duplicate_loop_to_header_edge. */
1965 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
1966 unrolling);
1967 bb->aux = 0;
1968 orig_insn = BB_HEAD (orig_bb);
1969 FOR_BB_INSNS_SAFE (bb, insn, next)
1970 {
1971 if (!INSN_P (insn)
1972 || (DEBUG_INSN_P (insn)
1973 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
1974 continue;
1975
1976 while (!INSN_P (orig_insn)
1977 || (DEBUG_INSN_P (orig_insn)
1978 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
1979 == LABEL_DECL)))
1980 orig_insn = NEXT_INSN (orig_insn);
1981
1982 ivts_templ.insn = orig_insn;
1983 ve_templ.insn = orig_insn;
1984
1985 /* Apply splitting iv optimization. */
1986 if (opt_info->insns_to_split)
1987 {
1988 maybe_strip_eq_note_for_split_iv (opt_info, insn);
1989
1990 ivts = opt_info->insns_to_split->find (&ivts_templ);
1991
1992 if (ivts)
1993 {
1994 gcc_assert (GET_CODE (PATTERN (insn))
1995 == GET_CODE (PATTERN (orig_insn)));
1996
1997 if (!delta)
1998 insert_base_initialization (ivts, insn);
1999 split_iv (ivts, insn, delta);
2000 }
2001 }
2002 /* Apply variable expansion optimization. */
2003 if (unrolling && opt_info->insns_with_var_to_expand)
2004 {
2005 ves = (struct var_to_expand *)
2006 opt_info->insns_with_var_to_expand->find (&ve_templ);
2007 if (ves)
2008 {
2009 gcc_assert (GET_CODE (PATTERN (insn))
2010 == GET_CODE (PATTERN (orig_insn)));
2011 expand_var_during_unrolling (ves, insn);
2012 }
2013 }
2014 orig_insn = NEXT_INSN (orig_insn);
2015 }
2016 }
2017
2018 if (!rewrite_original_loop)
2019 return;
2020
2021 /* Initialize the variable expansions in the loop preheader
2022 and take care of combining them at the loop exit. */
2023 if (opt_info->insns_with_var_to_expand)
2024 {
2025 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2026 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2027 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2028 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2029 }
2030
2031 /* Also rewrite the original loop body. Find its blocks as the originals
2032 of the blocks in the last copied iteration, i.e. those that have
2033 get_bb_copy (get_bb_original (bb)) == bb. */
2034 for (i = opt_info->first_new_block;
2035 i < (unsigned) last_basic_block_for_fn (cfun);
2036 i++)
2037 {
2038 bb = BASIC_BLOCK_FOR_FN (cfun, i);
2039 orig_bb = get_bb_original (bb);
2040 if (get_bb_copy (orig_bb) != bb)
2041 continue;
2042
2043 delta = determine_split_iv_delta (0, n_copies, unrolling);
2044 for (orig_insn = BB_HEAD (orig_bb);
2045 orig_insn != NEXT_INSN (BB_END (bb));
2046 orig_insn = next)
2047 {
2048 next = NEXT_INSN (orig_insn);
2049
2050 if (!INSN_P (orig_insn))
2051 continue;
2052
2053 ivts_templ.insn = orig_insn;
2054 if (opt_info->insns_to_split)
2055 {
2056 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2057
2058 ivts = (struct iv_to_split *)
2059 opt_info->insns_to_split->find (&ivts_templ);
2060 if (ivts)
2061 {
2062 if (!delta)
2063 insert_base_initialization (ivts, orig_insn);
2064 split_iv (ivts, orig_insn, delta);
2065 continue;
2066 }
2067 }
2068
2069 }
2070 }
2071 }
2072
2073 /* Release OPT_INFO. */
2074
2075 static void
2076 free_opt_info (struct opt_info *opt_info)
2077 {
2078 delete opt_info->insns_to_split;
2079 opt_info->insns_to_split = NULL;
2080 if (opt_info->insns_with_var_to_expand)
2081 {
2082 struct var_to_expand *ves;
2083
2084 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2085 ves->var_expansions.release ();
2086 delete opt_info->insns_with_var_to_expand;
2087 opt_info->insns_with_var_to_expand = NULL;
2088 }
2089 free (opt_info);
2090 }