/* Loop unrolling.
   Copyright (C) 2002-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "expmed.h"
#include "optabs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "alias.h"
#include "profile.h"
#include "cfgrtl.h"
#include "cfgloop.h"
#include "params.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "dumpfile.h"

/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better code for further optimization passes, and in some cases
   from fewer tests of exit conditions.  The main problem is code growth,
   which impacts performance negatively through its effect on caches.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several %.
   */
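
/* For example (an illustrative sketch; the value 200 is an assumed
   setting of the --param, not necessarily the current default): with
   PARAM_MAX_UNROLLED_INSNS == 200, a 20-insn innermost loop may receive
   up to 200 / 20 == 10 copies of its body, further capped by
   PARAM_MAX_AVERAGE_UNROLLED_INSNS and PARAM_MAX_UNROLL_TIMES in the
   decide_* routines below.  */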

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	 /* The insn in which the induction variable occurs.  */
  rtx orig_var;		 /* The variable (register) for the IV before split.  */
  rtx base_var;		 /* The variable on which the values in the further
			    iterations are based.  */
  rtx step;		 /* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	    /* The insn in which the variable expansion occurs.  */
  rtx reg;		    /* The accumulator which is expanded.  */
  vec<rtx> var_expansions;  /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next; /* Next entry in walking order.  */
  enum rtx_code op;	    /* The type of the accumulation - addition, subtraction
			       or multiplication.  */
  int expansion_count;	    /* Count the number of expansions generated so far.  */
  int reuse_expansion;	    /* The expansion we intend to reuse to expand
			       the accumulator.  If REUSE_EXPANSION is 0 reuse
			       the original accumulator.  Else use
			       var_expansions[REUSE_EXPANSION - 1].  */
};

/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};


/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}

/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}

/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};

/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}

/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}

/* Information about optimizations applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};

static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (struct loop *loop, location_t locus)
{
  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_printf_loc (report_flags, locus,
		   "loop unrolled %d times",
		   loop->lpt_decision.times);
  if (profile_info)
    dump_printf (report_flags,
		 " (header execution count %d)",
		 (int)loop->header->count);

  dump_printf (report_flags, "\n");
}

/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (TDF_RTL, locus,
			 ";; *** Considering loop %d at BB %d for "
			 "unrolling ***\n",
			 loop->num, loop->header->index);

      /* Do not unroll cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
	 priority.  */

      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}

/* Unroll LOOPS.  */
void
unroll_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* Now decide the rest of the unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }

  iv_analysis_done ();
}

/* Check whether the exit of the LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn) && active_insn_p (insn))
	return false;
    }

  return true;
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with constant "
	     "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check the number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in case the loop has more
     than one exit it may well loop less than the determined maximal
     number of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few copies of the loop body as possible are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
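
/* A worked example of the search above (illustrative, not from the
   sources): with desc->niter == 102, nunroll == 8 and the exit test at
   the beginning of the body, we scan i from 18 down to 7 minimizing
   n_copies == exit_mod + i + 1.  The winner is i == 9: 102 % 10 == 2,
   i.e. 12 body copies in total -- a 10-way unrolled loop plus 2 peeled
   iterations.  */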

/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as
	 above, if the loop tests the condition at the end of the loop
	 body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care
	 about the case where we would exit before reaching the loop.
	 So just peel exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find the new in and out edges; they are in the last copy we
	 have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime-computable number
   of times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with runtime "
	     "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simplicity.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
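
/* Illustrative example of the power-of-2 forcing above: if the size
   limits allow nunroll == 7, the loop leaves i == 4, so we record
   times == 3, i.e. 4 copies of the body.  The preconditioning code in
   unroll_loop_runtime_iterations relies on this power-of-two copy
   count to compute the modulo with a simple AND.  */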

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, int prob, rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label, -1);
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  add_int_reg_note (jump, REG_BR_PROB, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}

/* Unroll LOOP, for which we are able to count the number of iterations
   at runtime, LOOP->LPT_DECISION.TIMES times.  The transformation does
   this (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
	 body; i++;
       case 2:
	 body; i++;
       case 1:
	 body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
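/* A note on the preconditioning arithmetic (illustrative): since the
   copy count LOOP->LPT_DECISION.TIMES + 1 is forced to be a power of
   two, the "mod = n % 4" above is actually computed as "n & 3" in the
   sequence built below, which stays correct even if the computation
   of n overflowed.  */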
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave the exit in the first copy (for an explanation why, see
	 the comment in unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave the exit in the last copy (for an explanation why, see
	 the comment in unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get the expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count the modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of the loop body (almost always we must leave
     the exit test here; the only exception is when we have an extra zero
     check and the number of iterations is reliable).  Also record the
     place of the (possible) extra zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      1, wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where the switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create an item for the switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add a branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll the loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find the new in and out edges; they are in the last copy we
	 have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simplicity.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial
	 assumptions for the loop to be really simple.  We could update
	 the counts, but the problem is that we are unable to decide
	 which exit will be taken (not really true in case the number of
	 iterations is constant, but no one will do anything with this
	 information, so we do not worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}

/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  ++*debug_uses;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using, which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation, will yield wrong
	 results for (x = something - x); thus avoid using it in that
	 case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace chains of the type

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
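
/* Example (illustrative, not from the sources): a plain biv increment
   such as (set (reg:SI i) (plus:SI (reg:SI i) (const_int 4))) passes
   the checks above, while an iv that is only valid via an extension
   (iv.mode != iv.extend_mode, e.g. an SImode biv used zero-extended to
   DImode) is rejected.  */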

/* Determine which insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records the start of the
   duplicated area in OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
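
/* Example (illustrative): when unrolling with N_COPIES == 3, the base
   variable is initialized in the original body (copy 0), so copy 2 lies
   two iterations past the initialization and gets delta 2.  When
   peeling, the initialization happens in copy 1, so copy 0 (the
   original loop, which runs last) gets delta N_COPIES.  */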

/* Allocate the basic variable for the induction variable chain IVTS.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert the initialization of the basic variable of IVTS before INSN,
   taking the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}

/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}


/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
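
/* Example (illustrative): with two recorded expansions, successive
   calls return VE->reg, var_expansions[0], var_expansions[1], and then
   VE->reg again -- a round-robin over the original accumulator and its
   copies once the expansion limit has been reached.  */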

/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of sum should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}

/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (! note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds the position in the copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_INSN_P (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_INSN_P (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Also rewrite the original loop body.  Find its blocks as originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  delete opt_info->insns_to_split;
  opt_info->insns_to_split = NULL;
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	ves->var_expansions.release ();
      delete opt_info->insns_with_var_to_expand;
      opt_info->insns_with_var_to_expand = NULL;
    }
  free (opt_info);
}