gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "tree.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hash-table.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll/peel.
  66
   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of several %.
  70    */
  71
/* Information about induction variables to split.  Entries are linked
   through NEXT in walking order.  */

struct iv_to_split
{
  rtx insn;             /* The insn in which the induction variable occurs.  */
  rtx orig_var;         /* The variable (register) for the IV before split.  */
  rtx base_var;         /* The variable on which the values in the further
                           iterations are based.  */
  rtx step;             /* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
  unsigned n_loc;       /* Number of entries of LOC that are in use (see the
                           example given for LOC).  */
  unsigned loc[3];      /* Location where the definition of the induction
                           variable occurs in the insn.  For example if
                           N_LOC is 2, the expression is located at
                           XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};
  88
/* Information about accumulators to expand.  Entries are linked through
   NEXT in walking order.  */

struct var_to_expand
{
  rtx insn;                        /* The insn in which the variable expansion occurs.  */
  rtx reg;                         /* The accumulator which is expanded.  */
  vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;      /* Next entry in walking order.  */
  enum rtx_code op;                /* The type of the accumulation - addition, subtraction
                                      or multiplication.  */
  int expansion_count;             /* Count the number of expansions generated so far.  */
  int reuse_expansion;             /* The expansion we intend to reuse to expand
                                      the accumulator.  If REUSE_EXPANSION is 0 reuse
                                      the original accumulator.  Else use
                                      var_expansions[REUSE_EXPANSION - 1].  */
};
 105
/* Hashtable helper for iv_to_split.  Supplies the value/compare types
   and the hash and equality callbacks for a table of iv_to_split
   entries.  */

struct iv_split_hasher : typed_free_remove <iv_to_split>
{
  /* Type of the elements stored in the table.  */
  typedef iv_to_split value_type;
  /* Type of the key an element is compared against.  */
  typedef iv_to_split compare_type;
  /* Hash callback; defined out of line.  */
  static inline hashval_t hash (const value_type *);
  /* Equality callback; defined out of line.  */
  static inline bool equal (const value_type *, const compare_type *);
};
 115
 116
 117 /* A hash function for information about insns to split.  */
 118
 119 inline hashval_t
 120 iv_split_hasher::hash (const value_type *ivts)
 121 {
 122   return (hashval_t) INSN_UID (ivts->insn);
 123 }
 124
 125 /* An equality functions for information about insns to split.  */
 126
 127 inline bool
 128 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 129 {
 130   return i1->insn == i2->insn;
 131 }
 132
/* Hashtable helper for var_to_expand.  Supplies the value/compare types
   and the hash and equality callbacks for a table of var_to_expand
   entries.  */

struct var_expand_hasher : typed_free_remove <var_to_expand>
{
  /* Type of the elements stored in the table.  */
  typedef var_to_expand value_type;
  /* Type of the key an element is compared against.  */
  typedef var_to_expand compare_type;
  /* Hash callback; defined out of line.  */
  static inline hashval_t hash (const value_type *);
  /* Equality callback; defined out of line.  */
  static inline bool equal (const value_type *, const compare_type *);
};
 142
 143 /* Return a hash for VES.  */
 144
 145 inline hashval_t
 146 var_expand_hasher::hash (const value_type *ves)
 147 {
 148   return (hashval_t) INSN_UID (ves->insn);
 149 }
 150
 151 /* Return true if I1 and I2 refer to the same instruction.  */
 152
 153 inline bool
 154 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 155 {
 156   return i1->insn == i2->insn;
 157 }
 158
/* Information about optimization applied in
   the unrolled loop.  It holds one hashtable and one linked list (head
   plus tail pointer) for each of the two kinds of entry, iv_to_split
   and var_to_expand.  */

struct opt_info
{
  hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
                                                  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
                                        insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;        /* The first basic block that was
                                      duplicated.  */
  basic_block loop_exit;           /* The loop exit basic block.  */
  basic_block loop_preheader;      /* The loop preheader basic block.  */
};
 177
/* Forward declarations of file-local functions.  */

/* Deciding which transformation, if any, to apply to each loop.  */
static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
/* Performing the chosen transformation.  */
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
/* Handling of struct opt_info and of its iv_to_split and var_to_expand
   entries.  */
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
                                                 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
                                             basic_block);
static rtx get_expansion (struct var_to_expand *);
 204
 205 /* Emit a message summarizing the unroll or peel that will be
 206    performed for LOOP, along with the loop's location LOCUS, if
 207    appropriate given the dump or -fopt-info settings.  */
 208
 209 static void
 210 report_unroll_peel (struct loop *loop, location_t locus)
 211 {
 212   struct niter_desc *desc;
 213   int niters = 0;
 214   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 215
 216   if (loop->lpt_decision.decision == LPT_NONE)
 217     return;
 218
 219   if (!dump_enabled_p ())
 220     return;
 221
 222   /* In the special case where the loop never iterated, emit
 223      a different message so that we don't report an unroll by 0.
 224      This matches the equivalent message emitted during tree unrolling.  */
 225   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 226       && !loop->lpt_decision.times)
 227     {
 228       dump_printf_loc (report_flags, locus,
 229                        "loop turned into non-loop; it never loops.\n");
 230       return;
 231     }
 232
 233   desc = get_simple_loop_desc (loop);
 234
 235   if (desc->const_iter)
 236     niters = desc->niter;
 237   else if (loop->header->count)
 238     niters = expected_loop_iterations (loop);
 239
 240   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 241     dump_printf_loc (report_flags, locus,
 242                      "loop with %d iterations completely unrolled",
 243                      loop->lpt_decision.times + 1);
 244   else
 245     dump_printf_loc (report_flags, locus,
 246                      "loop %s %d times",
 247                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 248                        ? "peeled" : "unrolled"),
 249                      loop->lpt_decision.times);
 250   if (profile_info)
 251     dump_printf (report_flags,
 252                  " (header execution count %d",
 253                  (int)loop->header->count);
 254   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 255     dump_printf (report_flags,
 256                  "%s%s iterations %d)",
 257                  profile_info ? ", " : " (",
 258                  desc->const_iter ? "const" : "average",
 259                  niters);
 260   else if (profile_info)
 261     dump_printf (report_flags, ")");
 262
 263   dump_printf (report_flags, "\n");
 264 }
 265
 266 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 267 void
 268 unroll_and_peel_loops (int flags)
 269 {
 270   struct loop *loop;
 271   bool changed = false;
 272
 273   /* First perform complete loop peeling (it is almost surely a win,
 274      and affects parameters for further decision a lot).  */
 275   peel_loops_completely (flags);
 276
 277   /* Now decide rest of unrolling and peeling.  */
 278   decide_unrolling_and_peeling (flags);
 279
 280   /* Scan the loops, inner ones first.  */
 281   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 282     {
 283       /* And perform the appropriate transformations.  */
 284       switch (loop->lpt_decision.decision)
 285         {
 286         case LPT_PEEL_COMPLETELY:
 287           /* Already done.  */
 288           gcc_unreachable ();
 289         case LPT_PEEL_SIMPLE:
 290           peel_loop_simple (loop);
 291           changed = true;
 292           break;
 293         case LPT_UNROLL_CONSTANT:
 294           unroll_loop_constant_iterations (loop);
 295           changed = true;
 296           break;
 297         case LPT_UNROLL_RUNTIME:
 298           unroll_loop_runtime_iterations (loop);
 299           changed = true;
 300           break;
 301         case LPT_UNROLL_STUPID:
 302           unroll_loop_stupid (loop);
 303           changed = true;
 304           break;
 305         case LPT_NONE:
 306           break;
 307         default:
 308           gcc_unreachable ();
 309         }
 310     }
 311
 312     if (changed)
 313       {
 314         calculate_dominance_info (CDI_DOMINATORS);
 315         fix_loop_structure (NULL);
 316       }
 317
 318   iv_analysis_done ();
 319 }
 320
 321 /* Check whether exit of the LOOP is at the end of loop body.  */
 322
 323 static bool
 324 loop_exit_at_end_p (struct loop *loop)
 325 {
 326   struct niter_desc *desc = get_simple_loop_desc (loop);
 327   rtx insn;
 328
 329   if (desc->in_edge->dest != loop->latch)
 330     return false;
 331
 332   /* Check that the latch is empty.  */
 333   FOR_BB_INSNS (loop->latch, insn)
 334     {
 335       if (NONDEBUG_INSN_P (insn))
 336         return false;
 337     }
 338
 339   return true;
 340 }
 341
 342 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 343 static void
 344 peel_loops_completely (int flags)
 345 {
 346   struct loop *loop;
 347   bool changed = false;
 348
 349   /* Scan the loops, the inner ones first.  */
 350   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 351     {
 352       loop->lpt_decision.decision = LPT_NONE;
 353       location_t locus = get_loop_location (loop);
 354
 355       if (dump_enabled_p ())
 356         dump_printf_loc (TDF_RTL, locus,
 357                          ";; *** Considering loop %d at BB %d for "
 358                          "complete peeling ***\n",
 359                          loop->num, loop->header->index);
 360
 361       loop->ninsns = num_loop_insns (loop);
 362
 363       decide_peel_once_rolling (loop, flags);
 364       if (loop->lpt_decision.decision == LPT_NONE)
 365         decide_peel_completely (loop, flags);
 366
 367       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 368         {
 369           report_unroll_peel (loop, locus);
 370           peel_loop_completely (loop);
 371           changed = true;
 372         }
 373     }
 374
 375     if (changed)
 376       {
 377         calculate_dominance_info (CDI_DOMINATORS);
 378         fix_loop_structure (NULL);
 379       }
 380 }
 381
 382 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 383 static void
 384 decide_unrolling_and_peeling (int flags)
 385 {
 386   struct loop *loop;
 387
 388   /* Scan the loops, inner ones first.  */
 389   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 390     {
 391       loop->lpt_decision.decision = LPT_NONE;
 392       location_t locus = get_loop_location (loop);
 393
 394       if (dump_enabled_p ())
 395         dump_printf_loc (TDF_RTL, locus,
 396                          ";; *** Considering loop %d at BB %d for "
 397                          "unrolling and peeling ***\n",
 398                          loop->num, loop->header->index);
 399
 400       /* Do not peel cold areas.  */
 401       if (optimize_loop_for_size_p (loop))
 402         {
 403           if (dump_file)
 404             fprintf (dump_file, ";; Not considering loop, cold area\n");
 405           continue;
 406         }
 407
 408       /* Can the loop be manipulated?  */
 409       if (!can_duplicate_loop_p (loop))
 410         {
 411           if (dump_file)
 412             fprintf (dump_file,
 413                      ";; Not considering loop, cannot duplicate\n");
 414           continue;
 415         }
 416
 417       /* Skip non-innermost loops.  */
 418       if (loop->inner)
 419         {
 420           if (dump_file)
 421             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 422           continue;
 423         }
 424
 425       loop->ninsns = num_loop_insns (loop);
 426       loop->av_ninsns = average_num_loop_insns (loop);
 427
 428       /* Try transformations one by one in decreasing order of
 429          priority.  */
 430
 431       decide_unroll_constant_iterations (loop, flags);
 432       if (loop->lpt_decision.decision == LPT_NONE)
 433         decide_unroll_runtime_iterations (loop, flags);
 434       if (loop->lpt_decision.decision == LPT_NONE)
 435         decide_unroll_stupid (loop, flags);
 436       if (loop->lpt_decision.decision == LPT_NONE)
 437         decide_peel_simple (loop, flags);
 438
 439       report_unroll_peel (loop, locus);
 440     }
 441 }
 442
 443 /* Decide whether the LOOP is once rolling and suitable for complete
 444    peeling.  */
 445 static void
 446 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 447 {
 448   struct niter_desc *desc;
 449
 450   if (dump_file)
 451     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 452
 453   /* Is the loop small enough?  */
 454   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 455     {
 456       if (dump_file)
 457         fprintf (dump_file, ";; Not considering loop, is too big\n");
 458       return;
 459     }
 460
 461   /* Check for simple loops.  */
 462   desc = get_simple_loop_desc (loop);
 463
 464   /* Check number of iterations.  */
 465   if (!desc->simple_p
 466       || desc->assumptions
 467       || desc->infinite
 468       || !desc->const_iter
 469       || (desc->niter != 0
 470           && get_max_loop_iterations_int (loop) != 0))
 471     {
 472       if (dump_file)
 473         fprintf (dump_file,
 474                  ";; Unable to prove that the loop rolls exactly once\n");
 475       return;
 476     }
 477
 478   /* Success.  */
 479   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 480 }
 481
 482 /* Decide whether the LOOP is suitable for complete peeling.  */
 483 static void
 484 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 485 {
 486   unsigned npeel;
 487   struct niter_desc *desc;
 488
 489   if (dump_file)
 490     fprintf (dump_file, "\n;; Considering peeling completely\n");
 491
 492   /* Skip non-innermost loops.  */
 493   if (loop->inner)
 494     {
 495       if (dump_file)
 496         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 497       return;
 498     }
 499
 500   /* Do not peel cold areas.  */
 501   if (optimize_loop_for_size_p (loop))
 502     {
 503       if (dump_file)
 504         fprintf (dump_file, ";; Not considering loop, cold area\n");
 505       return;
 506     }
 507
 508   /* Can the loop be manipulated?  */
 509   if (!can_duplicate_loop_p (loop))
 510     {
 511       if (dump_file)
 512         fprintf (dump_file,
 513                  ";; Not considering loop, cannot duplicate\n");
 514       return;
 515     }
 516
 517   /* npeel = number of iterations to peel.  */
 518   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 519   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 520     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 521
 522   /* Is the loop small enough?  */
 523   if (!npeel)
 524     {
 525       if (dump_file)
 526         fprintf (dump_file, ";; Not considering loop, is too big\n");
 527       return;
 528     }
 529
 530   /* Check for simple loops.  */
 531   desc = get_simple_loop_desc (loop);
 532
 533   /* Check number of iterations.  */
 534   if (!desc->simple_p
 535       || desc->assumptions
 536       || !desc->const_iter
 537       || desc->infinite)
 538     {
 539       if (dump_file)
 540         fprintf (dump_file,
 541                  ";; Unable to prove that the loop iterates constant times\n");
 542       return;
 543     }
 544
 545   if (desc->niter > npeel - 1)
 546     {
 547       if (dump_file)
 548         {
 549           fprintf (dump_file,
 550                    ";; Not peeling loop completely, rolls too much (");
 551           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 552           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 553         }
 554       return;
 555     }
 556
 557   /* Success.  */
 558   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 559 }
 560
 561 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 562    completely.  The transformation done:
 563
 564    for (i = 0; i < 4; i++)
 565      body;
 566
 567    ==>
 568
 569    i = 0;
 570    body; i++;
 571    body; i++;
 572    body; i++;
 573    body; i++;
 574    */
 575 static void
 576 peel_loop_completely (struct loop *loop)
 577 {
 578   sbitmap wont_exit;
 579   unsigned HOST_WIDE_INT npeel;
 580   unsigned i;
 581   edge ein;
 582   struct niter_desc *desc = get_simple_loop_desc (loop);
 583   struct opt_info *opt_info = NULL;
 584
 585   npeel = desc->niter;
 586
 587   if (npeel)
 588     {
 589       bool ok;
 590
 591       wont_exit = sbitmap_alloc (npeel + 1);
 592       bitmap_ones (wont_exit);
 593       bitmap_clear_bit (wont_exit, 0);
 594       if (desc->noloop_assumptions)
 595         bitmap_clear_bit (wont_exit, 1);
 596
 597       auto_vec<edge> remove_edges;
 598       if (flag_split_ivs_in_unroller)
 599         opt_info = analyze_insns_in_loop (loop);
 600
 601       opt_info_start_duplication (opt_info);
 602       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 603                                           npeel,
 604                                           wont_exit, desc->out_edge,
 605                                           &remove_edges,
 606                                           DLTHE_FLAG_UPDATE_FREQ
 607                                           | DLTHE_FLAG_COMPLETTE_PEEL
 608                                           | (opt_info
 609                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 610       gcc_assert (ok);
 611
 612       free (wont_exit);
 613
 614       if (opt_info)
 615         {
 616           apply_opt_in_copies (opt_info, npeel, false, true);
 617           free_opt_info (opt_info);
 618         }
 619
 620       /* Remove the exit edges.  */
 621       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 622         remove_path (ein);
 623     }
 624
 625   ein = desc->in_edge;
 626   free_simple_loop_desc (loop);
 627
 628   /* Now remove the unreachable part of the last iteration and cancel
 629      the loop.  */
 630   remove_path (ein);
 631
 632   if (dump_file)
 633     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 634 }
 635
 636 /* Decide whether to unroll LOOP iterating constant number of times
 637    and how much.  */
 638
 639 static void
 640 decide_unroll_constant_iterations (struct loop *loop, int flags)
 641 {
 642   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 643   struct niter_desc *desc;
 644   double_int iterations;
 645
 646   if (!(flags & UAP_UNROLL))
 647     {
 648       /* We were not asked to, just return back silently.  */
 649       return;
 650     }
 651
 652   if (dump_file)
 653     fprintf (dump_file,
 654              "\n;; Considering unrolling loop with constant "
 655              "number of iterations\n");
 656
 657   /* nunroll = total number of copies of the original loop body in
 658      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 659   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 660   nunroll_by_av
 661     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 662   if (nunroll > nunroll_by_av)
 663     nunroll = nunroll_by_av;
 664   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 665     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 666
 667   /* Skip big loops.  */
 668   if (nunroll <= 1)
 669     {
 670       if (dump_file)
 671         fprintf (dump_file, ";; Not considering loop, is too big\n");
 672       return;
 673     }
 674
 675   /* Check for simple loops.  */
 676   desc = get_simple_loop_desc (loop);
 677
 678   /* Check number of iterations.  */
 679   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 680     {
 681       if (dump_file)
 682         fprintf (dump_file,
 683                  ";; Unable to prove that the loop iterates constant times\n");
 684       return;
 685     }
 686
 687   /* Check whether the loop rolls enough to consider.
 688      Consult also loop bounds and profile; in the case the loop has more
 689      than one exit it may well loop less than determined maximal number
 690      of iterations.  */
 691   if (desc->niter < 2 * nunroll
 692       || ((get_estimated_loop_iterations (loop, &iterations)
 693            || get_max_loop_iterations (loop, &iterations))
 694           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 695     {
 696       if (dump_file)
 697         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 698       return;
 699     }
 700
 701   /* Success; now compute number of iterations to unroll.  We alter
 702      nunroll so that as few as possible copies of loop body are
 703      necessary, while still not decreasing the number of unrollings
 704      too much (at most by 1).  */
 705   best_copies = 2 * nunroll + 10;
 706
 707   i = 2 * nunroll + 2;
 708   if (i - 1 >= desc->niter)
 709     i = desc->niter - 2;
 710
 711   for (; i >= nunroll - 1; i--)
 712     {
 713       unsigned exit_mod = desc->niter % (i + 1);
 714
 715       if (!loop_exit_at_end_p (loop))
 716         n_copies = exit_mod + i + 1;
 717       else if (exit_mod != (unsigned) i
 718                || desc->noloop_assumptions != NULL_RTX)
 719         n_copies = exit_mod + i + 2;
 720       else
 721         n_copies = i + 1;
 722
 723       if (n_copies < best_copies)
 724         {
 725           best_copies = n_copies;
 726           best_unroll = i;
 727         }
 728     }
 729
 730   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 731   loop->lpt_decision.times = best_unroll;
 732 }
 733
 734 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 735    The transformation does this:
 736
 737    for (i = 0; i < 102; i++)
 738      body;
 739
 740    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 741
 742    i = 0;
 743    body; i++;
 744    body; i++;
 745    while (i < 102)
 746      {
 747        body; i++;
 748        body; i++;
 749        body; i++;
 750        body; i++;
 751      }
 752   */
 753 static void
 754 unroll_loop_constant_iterations (struct loop *loop)
 755 {
 756   unsigned HOST_WIDE_INT niter;
 757   unsigned exit_mod;
 758   sbitmap wont_exit;
 759   unsigned i;
 760   edge e;
 761   unsigned max_unroll = loop->lpt_decision.times;
 762   struct niter_desc *desc = get_simple_loop_desc (loop);
 763   bool exit_at_end = loop_exit_at_end_p (loop);
 764   struct opt_info *opt_info = NULL;
 765   bool ok;
 766
 767   niter = desc->niter;
 768
 769   /* Should not get here (such loop should be peeled instead).  */
 770   gcc_assert (niter > max_unroll + 1);
 771
 772   exit_mod = niter % (max_unroll + 1);
 773
 774   wont_exit = sbitmap_alloc (max_unroll + 1);
 775   bitmap_ones (wont_exit);
 776
 777   auto_vec<edge> remove_edges;
 778   if (flag_split_ivs_in_unroller
 779       || flag_variable_expansion_in_unroller)
 780     opt_info = analyze_insns_in_loop (loop);
 781
 782   if (!exit_at_end)
 783     {
 784       /* The exit is not at the end of the loop; leave exit test
 785          in the first copy, so that the loops that start with test
 786          of exit condition have continuous body after unrolling.  */
 787
 788       if (dump_file)
 789         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 790
 791       /* Peel exit_mod iterations.  */
 792       bitmap_clear_bit (wont_exit, 0);
 793       if (desc->noloop_assumptions)
 794         bitmap_clear_bit (wont_exit, 1);
 795
 796       if (exit_mod)
 797         {
 798           opt_info_start_duplication (opt_info);
 799           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 800                                               exit_mod,
 801                                               wont_exit, desc->out_edge,
 802                                               &remove_edges,
 803                                               DLTHE_FLAG_UPDATE_FREQ
 804                                               | (opt_info && exit_mod > 1
 805                                                  ? DLTHE_RECORD_COPY_NUMBER
 806                                                    : 0));
 807           gcc_assert (ok);
 808
 809           if (opt_info && exit_mod > 1)
 810             apply_opt_in_copies (opt_info, exit_mod, false, false);
 811
 812           desc->noloop_assumptions = NULL_RTX;
 813           desc->niter -= exit_mod;
 814           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 815           if (loop->any_estimate
 816               && double_int::from_uhwi (exit_mod).ule
 817                    (loop->nb_iterations_estimate))
 818             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 819           else
 820             loop->any_estimate = false;
 821         }
 822
 823       bitmap_set_bit (wont_exit, 1);
 824     }
 825   else
 826     {
 827       /* Leave exit test in last copy, for the same reason as above if
 828          the loop tests the condition at the end of loop body.  */
 829
 830       if (dump_file)
 831         fprintf (dump_file, ";; Condition at end of loop.\n");
 832
 833       /* We know that niter >= max_unroll + 2; so we do not need to care of
 834          case when we would exit before reaching the loop.  So just peel
 835          exit_mod + 1 iterations.  */
 836       if (exit_mod != max_unroll
 837           || desc->noloop_assumptions)
 838         {
 839           bitmap_clear_bit (wont_exit, 0);
 840           if (desc->noloop_assumptions)
 841             bitmap_clear_bit (wont_exit, 1);
 842
 843           opt_info_start_duplication (opt_info);
 844           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 845                                               exit_mod + 1,
 846                                               wont_exit, desc->out_edge,
 847                                               &remove_edges,
 848                                               DLTHE_FLAG_UPDATE_FREQ
 849                                               | (opt_info && exit_mod > 0
 850                                                  ? DLTHE_RECORD_COPY_NUMBER
 851                                                    : 0));
 852           gcc_assert (ok);
 853
 854           if (opt_info && exit_mod > 0)
 855             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 856
 857           desc->niter -= exit_mod + 1;
 858           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 859           if (loop->any_estimate
 860               && double_int::from_uhwi (exit_mod + 1).ule
 861                    (loop->nb_iterations_estimate))
 862             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 863           else
 864             loop->any_estimate = false;
 865           desc->noloop_assumptions = NULL_RTX;
 866
 867           bitmap_set_bit (wont_exit, 0);
 868           bitmap_set_bit (wont_exit, 1);
 869         }
 870
 871       bitmap_clear_bit (wont_exit, max_unroll);
 872     }
 873
 874   /* Now unroll the loop.  */
 875
 876   opt_info_start_duplication (opt_info);
 877   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 878                                       max_unroll,
 879                                       wont_exit, desc->out_edge,
 880                                       &remove_edges,
 881                                       DLTHE_FLAG_UPDATE_FREQ
 882                                       | (opt_info
 883                                          ? DLTHE_RECORD_COPY_NUMBER
 884                                            : 0));
 885   gcc_assert (ok);
 886
 887   if (opt_info)
 888     {
 889       apply_opt_in_copies (opt_info, max_unroll, true, true);
 890       free_opt_info (opt_info);
 891     }
 892
 893   free (wont_exit);
 894
 895   if (exit_at_end)
 896     {
 897       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 898       /* Find a new in and out edge; they are in the last copy we have made.  */
 899
 900       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 901         {
 902           desc->out_edge = EDGE_SUCC (exit_block, 0);
 903           desc->in_edge = EDGE_SUCC (exit_block, 1);
 904         }
 905       else
 906         {
 907           desc->out_edge = EDGE_SUCC (exit_block, 1);
 908           desc->in_edge = EDGE_SUCC (exit_block, 0);
 909         }
 910     }
 911
 912   desc->niter /= max_unroll + 1;
 913   loop->nb_iterations_upper_bound
 914     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 915                                                                    + 1),
 916                                             TRUNC_DIV_EXPR);
 917   if (loop->any_estimate)
 918     loop->nb_iterations_estimate
 919       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 920                                                                   + 1),
 921                                            TRUNC_DIV_EXPR);
 922   desc->niter_expr = GEN_INT (desc->niter);
 923
 924   /* Remove the edges.  */
 925   FOR_EACH_VEC_ELT (remove_edges, i, e)
 926     remove_path (e);
 927
 928   if (dump_file)
 929     fprintf (dump_file,
 930              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 931              max_unroll, num_loop_insns (loop));
 932 }
 933
 934 /* Decide whether to unroll LOOP iterating runtime computable number of times
 935    and how much.  */
 936 static void
 937 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 938 {
 939   unsigned nunroll, nunroll_by_av, i;
 940   struct niter_desc *desc;
 941   double_int iterations;
 942
 943   if (!(flags & UAP_UNROLL))
 944     {
 945       /* We were not asked to, just return back silently.  */
 946       return;
 947     }
 948
 949   if (dump_file)
 950     fprintf (dump_file,
 951              "\n;; Considering unrolling loop with runtime "
 952              "computable number of iterations\n");
 953
 954   /* nunroll = total number of copies of the original loop body in
 955      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 956   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 957   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 958   if (nunroll > nunroll_by_av)
 959     nunroll = nunroll_by_av;
 960   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 961     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 962
 963   if (targetm.loop_unroll_adjust)
 964     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 965
 966   /* Skip big loops.  */
 967   if (nunroll <= 1)
 968     {
 969       if (dump_file)
 970         fprintf (dump_file, ";; Not considering loop, is too big\n");
 971       return;
 972     }
 973
 974   /* Check for simple loops.  */
 975   desc = get_simple_loop_desc (loop);
 976
 977   /* Check simpleness.  */
 978   if (!desc->simple_p || desc->assumptions)
 979     {
 980       if (dump_file)
 981         fprintf (dump_file,
 982                  ";; Unable to prove that the number of iterations "
 983                  "can be counted in runtime\n");
 984       return;
 985     }
 986
 987   if (desc->const_iter)
 988     {
 989       if (dump_file)
 990         fprintf (dump_file, ";; Loop iterates constant times\n");
 991       return;
 992     }
 993
 994   /* Check whether the loop rolls.  */
 995   if ((get_estimated_loop_iterations (loop, &iterations)
 996        || get_max_loop_iterations (loop, &iterations))
 997       && iterations.ult (double_int::from_shwi (2 * nunroll)))
 998     {
 999       if (dump_file)
1000         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1001       return;
1002     }
1003
1004   /* Success; now force nunroll to be power of 2, as we are unable to
1005      cope with overflows in computation of number of iterations.  */
1006   for (i = 1; 2 * i <= nunroll; i *= 2)
1007     continue;
1008
1009   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1010   loop->lpt_decision.times = i - 1;
1011 }
1012
1013 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1014    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1015    and NULL is returned instead.  */
1016
1017 basic_block
1018 split_edge_and_insert (edge e, rtx insns)
1019 {
1020   basic_block bb;
1021
1022   if (!insns)
1023     return NULL;
1024   bb = split_edge (e);
1025   emit_insn_after (insns, BB_END (bb));
1026
1027   /* ??? We used to assume that INSNS can contain control flow insns, and
1028      that we had to try to find sub basic blocks in BB to maintain a valid
1029      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1030      and call break_superblocks when going out of cfglayout mode.  But it
1031      turns out that this never happens; and that if it does ever happen,
1032      the TODO_verify_flow at the end of the RTL loop passes would fail.
1033
1034      There are two reasons why we expected we could have control flow insns
1035      in INSNS.  The first is when a comparison has to be done in parts, and
1036      the second is when the number of iterations is computed for loops with
1037      the number of iterations known at runtime.  In both cases, test cases
1038      to get control flow in INSNS appear to be impossible to construct:
1039
1040       * If do_compare_rtx_and_jump needs several branches to do comparison
1041         in a mode that needs comparison by parts, we cannot analyze the
1042         number of iterations of the loop, and we never get to unrolling it.
1043
1044       * The code in expand_divmod that was suspected to cause creation of
1045         branching code seems to be only accessed for signed division.  The
1046         divisions used by # of iterations analysis are always unsigned.
1047         Problems might arise on architectures that emits branching code
1048         for some operations that may appear in the unroller (especially
1049         for division), but we have no such architectures.
1050
1051      Considering all this, it was decided that we should for now assume
1052      that INSNS can in theory contain control flow insns, but in practice
1053      it never does.  So we don't handle the theoretical case, and should
1054      a real failure ever show up, we have a pretty good clue for how to
1055      fix it.  */
1056
1057   return bb;
1058 }
1059
/* Unroll LOOP for which we are able to count number of iterations in runtime
   LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
   extra care for case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }

   Besides rewriting the CFG, the function rewrites DESC->niter_expr and
   scales LOOP's recorded iteration upper bound and estimate.  When the exit
   test sits at the end of the loop body it also re-points DESC->in_edge and
   DESC->out_edge and clears DESC->noloop_assumptions.  */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  /* Analyze the insns of the loop only when one of the two unroller
     flags is enabled; otherwise OPT_INFO stays NULL.  */
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  /* For every block of the loop, collect the blocks that
     get_dominated_by reports for it and that lie outside the loop.  */
  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
        if (!flow_bb_inside_loop_p (loop, bb))
          dom_bbs.safe_push (bb);

      /* Release the temporary vector returned by get_dominated_by.  */
      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  OLD_NITER keeps the register
     holding the full count; it is used again at the end to build the new
     niter expression.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  From here on NITER is
     the masked value, not the full count.  */
  niter = expand_simple_binop (desc->mode, AND,
                               niter, gen_int_mode (max_unroll, desc->mode),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop: put the computation of the iteration count on
     the preheader edge.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  /* NOTE(review): presumably bit 0 of WONT_EXIT stands for the original
     body and bit K for the K-th copy made by
     duplicate_loop_to_header_edge -- confirm against that function.  */
  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable).  Also record the place of (possible) extra
     zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  /* The source block of the preheader edge, taken before peeling; the zero
     check is later inserted on its outgoing edge.  */
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      1, wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  Its exit test is kept only for the last peeled
         copy, and only when LAST_MAY_EXIT is set.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          1, wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  J is the value of the masked iteration
         count that selects this entry point; P is the probability given to
         the branch, REG_BR_PROB_BASE / (I + 2).  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
         and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      /* Insert the test in front of the previous switch block and add the
         taken edge to the freshly split preheader.  */
      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      /* NOTE(review): this sets the probability on the edge entering SWTCH,
         whereas the zero-check code below sets it on the edge leaving
         SWTCH -- confirm which one is intended.  */
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      /* NOTE(review): the count is preheader->count multiplied by
         REG_BR_PROB_BASE and divided by P (presumably RDIV is a rounding
         division) -- confirm this is the intended profile update.  */
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  The test goes on the edge leaving
         the block recorded in EZC_SWTCH before the first peel.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  Every bit of WONT_EXIT is set except the one for
     MAY_EXIT_COPY chosen above.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  The successor that leads to the old exit destination becomes
         the out edge, the other one the in edge.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  /* Remove the edges collected in REMOVE_EDGES by the duplications.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
                         gen_int_mode (max_unroll + 1, desc->mode));
  /* Scale the recorded upper bound and estimate by the same factor.  */
  loop->nb_iterations_upper_bound
    = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
                                                                   + 1),
                                            TRUNC_DIV_EXPR);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
                                                                  + 1),
                                           TRUNC_DIV_EXPR);
  if (exit_at_end)
    {
      /* With the exit test at the end, the expression and the bounds are
         additionally decreased by one; an estimate that is already zero
         is dropped instead of being decremented.  */
      desc->niter_expr =
        simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
          && loop->nb_iterations_estimate != double_int_zero)
        --loop->nb_iterations_estimate;
      else
        loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, counting # of iterations "
             "in runtime, %i insns\n",
             max_unroll, num_loop_insns (loop));
}
1331
1332 /* Decide whether to simply peel LOOP and how much.  */
1333 static void
1334 decide_peel_simple (struct loop *loop, int flags)
1335 {
1336   unsigned npeel;
1337   double_int iterations;
1338
1339   if (!(flags & UAP_PEEL))
1340     {
1341       /* We were not asked to, just return back silently.  */
1342       return;
1343     }
1344
1345   if (dump_file)
1346     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1347
1348   /* npeel = number of iterations to peel.  */
1349   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1350   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1351     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1352
1353   /* Skip big loops.  */
1354   if (!npeel)
1355     {
1356       if (dump_file)
1357         fprintf (dump_file, ";; Not considering loop, is too big\n");
1358       return;
1359     }
1360
1361   /* Do not simply peel loops with branches inside -- it increases number
1362      of mispredicts.
1363      Exception is when we do have profile and we however have good chance
1364      to peel proper number of iterations loop will iterate in practice.
1365      TODO: this heuristic needs tunning; while for complette unrolling
1366      the branch inside loop mostly eliminates any improvements, for
1367      peeling it is not the case.  Also a function call inside loop is
1368      also branch from branch prediction POV (and probably better reason
1369      to not unroll/peel).  */
1370   if (num_loop_branches (loop) > 1
1371       && profile_status != PROFILE_READ)
1372     {
1373       if (dump_file)
1374         fprintf (dump_file, ";; Not peeling, contains branches\n");
1375       return;
1376     }
1377
1378   /* If we have realistic estimate on number of iterations, use it.  */
1379   if (get_estimated_loop_iterations (loop, &iterations))
1380     {
1381       if (double_int::from_shwi (npeel).ule (iterations))
1382         {
1383           if (dump_file)
1384             {
1385               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1386               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1387                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1388               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1389                        npeel);
1390             }
1391           return;
1392         }
1393       npeel = iterations.to_shwi () + 1;
1394     }
1395   /* If we have small enough bound on iterations, we can still peel (completely
1396      unroll).  */
1397   else if (get_max_loop_iterations (loop, &iterations)
1398            && iterations.ult (double_int::from_shwi (npeel)))
1399     npeel = iterations.to_shwi () + 1;
1400   else
1401     {
1402       /* For now we have no good heuristics to decide whether loop peeling
1403          will be effective, so disable it.  */
1404       if (dump_file)
1405         fprintf (dump_file,
1406                  ";; Not peeling loop, no evidence it will be profitable\n");
1407       return;
1408     }
1409
1410   /* Success.  */
1411   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1412   loop->lpt_decision.times = npeel;
1413 }
1414
1415 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1416
1417    while (cond)
1418      body;
1419
1420    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1421
1422    if (!cond) goto end;
1423    body;
1424    if (!cond) goto end;
1425    body;
1426    if (!cond) goto end;
1427    body;
1428    while (cond)
1429      body;
1430    end: ;
1431    */
1432 static void
1433 peel_loop_simple (struct loop *loop)
1434 {
1435   sbitmap wont_exit;
1436   unsigned npeel = loop->lpt_decision.times;
1437   struct niter_desc *desc = get_simple_loop_desc (loop);
1438   struct opt_info *opt_info = NULL;
1439   bool ok;
1440
1441   if (flag_split_ivs_in_unroller && npeel > 1)
1442     opt_info = analyze_insns_in_loop (loop);
1443
1444   wont_exit = sbitmap_alloc (npeel + 1);
1445   bitmap_clear (wont_exit);
1446
1447   opt_info_start_duplication (opt_info);
1448
1449   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1450                                       npeel, wont_exit, NULL,
1451                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1452                                       | (opt_info
1453                                          ? DLTHE_RECORD_COPY_NUMBER
1454                                            : 0));
1455   gcc_assert (ok);
1456
1457   free (wont_exit);
1458
1459   if (opt_info)
1460     {
1461       apply_opt_in_copies (opt_info, npeel, false, false);
1462       free_opt_info (opt_info);
1463     }
1464
1465   if (desc->simple_p)
1466     {
1467       if (desc->const_iter)
1468         {
1469           desc->niter -= npeel;
1470           desc->niter_expr = GEN_INT (desc->niter);
1471           desc->noloop_assumptions = NULL_RTX;
1472         }
1473       else
1474         {
1475           /* We cannot just update niter_expr, as its value might be clobbered
1476              inside loop.  We could handle this by counting the number into
1477              temporary just like we do in runtime unrolling, but it does not
1478              seem worthwhile.  */
1479           free_simple_loop_desc (loop);
1480         }
1481     }
1482   if (dump_file)
1483     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1484 }
1485
1486 /* Decide whether to unroll LOOP stupidly and how much.  */
1487 static void
1488 decide_unroll_stupid (struct loop *loop, int flags)
1489 {
1490   unsigned nunroll, nunroll_by_av, i;
1491   struct niter_desc *desc;
1492   double_int iterations;
1493
1494   if (!(flags & UAP_UNROLL_ALL))
1495     {
1496       /* We were not asked to, just return back silently.  */
1497       return;
1498     }
1499
1500   if (dump_file)
1501     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1502
1503   /* nunroll = total number of copies of the original loop body in
1504      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1505   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1506   nunroll_by_av
1507     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1508   if (nunroll > nunroll_by_av)
1509     nunroll = nunroll_by_av;
1510   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1511     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1512
1513   if (targetm.loop_unroll_adjust)
1514     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1515
1516   /* Skip big loops.  */
1517   if (nunroll <= 1)
1518     {
1519       if (dump_file)
1520         fprintf (dump_file, ";; Not considering loop, is too big\n");
1521       return;
1522     }
1523
1524   /* Check for simple loops.  */
1525   desc = get_simple_loop_desc (loop);
1526
1527   /* Check simpleness.  */
1528   if (desc->simple_p && !desc->assumptions)
1529     {
1530       if (dump_file)
1531         fprintf (dump_file, ";; The loop is simple\n");
1532       return;
1533     }
1534
1535   /* Do not unroll loops with branches inside -- it increases number
1536      of mispredicts.
1537      TODO: this heuristic needs tunning; call inside the loop body
1538      is also relatively good reason to not unroll.  */
1539   if (num_loop_branches (loop) > 1)
1540     {
1541       if (dump_file)
1542         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1543       return;
1544     }
1545
1546   /* Check whether the loop rolls.  */
1547   if ((get_estimated_loop_iterations (loop, &iterations)
1548        || get_max_loop_iterations (loop, &iterations))
1549       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1550     {
1551       if (dump_file)
1552         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1553       return;
1554     }
1555
1556   /* Success.  Now force nunroll to be power of 2, as it seems that this
1557      improves results (partially because of better alignments, partially
1558      because of some dark magic).  */
1559   for (i = 1; 2 * i <= nunroll; i *= 2)
1560     continue;
1561
1562   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1563   loop->lpt_decision.times = i - 1;
1564 }
1565
1566 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1567
1568    while (cond)
1569      body;
1570
1571    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1572
1573    while (cond)
1574      {
1575        body;
1576        if (!cond) break;
1577        body;
1578        if (!cond) break;
1579        body;
1580        if (!cond) break;
1581        body;
1582      }
1583    */
1584 static void
1585 unroll_loop_stupid (struct loop *loop)
1586 {
1587   sbitmap wont_exit;
1588   unsigned nunroll = loop->lpt_decision.times;
1589   struct niter_desc *desc = get_simple_loop_desc (loop);
1590   struct opt_info *opt_info = NULL;
1591   bool ok;
1592
1593   if (flag_split_ivs_in_unroller
1594       || flag_variable_expansion_in_unroller)
1595     opt_info = analyze_insns_in_loop (loop);
1596
1597
1598   wont_exit = sbitmap_alloc (nunroll + 1);
1599   bitmap_clear (wont_exit);
1600   opt_info_start_duplication (opt_info);
1601
1602   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1603                                       nunroll, wont_exit,
1604                                       NULL, NULL,
1605                                       DLTHE_FLAG_UPDATE_FREQ
1606                                       | (opt_info
1607                                          ? DLTHE_RECORD_COPY_NUMBER
1608                                            : 0));
1609   gcc_assert (ok);
1610
1611   if (opt_info)
1612     {
1613       apply_opt_in_copies (opt_info, nunroll, true, true);
1614       free_opt_info (opt_info);
1615     }
1616
1617   free (wont_exit);
1618
1619   if (desc->simple_p)
1620     {
1621       /* We indeed may get here provided that there are nontrivial assumptions
1622          for a loop to be really simple.  We could update the counts, but the
1623          problem is that we are unable to decide which exit will be taken
1624          (not really true in case the number of iterations is constant,
1625          but no one will do anything with this information, so we do not
1626          worry about it).  */
1627       desc->simple_p = false;
1628     }
1629
1630   if (dump_file)
1631     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1632              nunroll, num_loop_insns (loop));
1633 }
1634
1635 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1636    Set *DEBUG_USES to the number of debug insns that reference the
1637    variable.  */
1638
1639 bool
1640 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1641                                   int *debug_uses)
1642 {
1643   basic_block *body, bb;
1644   unsigned i;
1645   int count_ref = 0;
1646   rtx insn;
1647
1648   body = get_loop_body (loop);
1649   for (i = 0; i < loop->num_nodes; i++)
1650     {
1651       bb = body[i];
1652
1653       FOR_BB_INSNS (bb, insn)
1654         if (!rtx_referenced_p (reg, insn))
1655           continue;
1656         else if (DEBUG_INSN_P (insn))
1657           ++*debug_uses;
1658         else if (++count_ref > 1)
1659           break;
1660     }
1661   free (body);
1662   return (count_ref  == 1);
1663 }
1664
1665 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1666
1667 static void
1668 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1669 {
1670   basic_block *body, bb;
1671   unsigned i;
1672   rtx insn;
1673
1674   body = get_loop_body (loop);
1675   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1676     {
1677       bb = body[i];
1678
1679       FOR_BB_INSNS (bb, insn)
1680         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1681           continue;
1682         else
1683           {
1684             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1685                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1686             if (!--debug_uses)
1687               break;
1688           }
1689     }
1690   free (body);
1691 }
1692
1693 /* Determine whether INSN contains an accumulator
1694    which can be expanded into separate copies,
1695    one for each copy of the LOOP body.
1696
1697    for (i = 0 ; i < n; i++)
1698      sum += a[i];
1699
1700    ==>
1701
1702    sum += a[i]
1703    ....
1704    i = i+1;
1705    sum1 += a[i]
1706    ....
1707    i = i+1
1708    sum2 += a[i];
1709    ....
1710
1711    Return NULL if INSN contains no opportunity for expansion of accumulator.
1712    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1713    information and return a pointer to it.
1714 */
1715
1716 static struct var_to_expand *
1717 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1718 {
1719   rtx set, dest, src;
1720   struct var_to_expand *ves;
1721   unsigned accum_pos;
1722   enum rtx_code code;
1723   int debug_uses = 0;
1724
1725   set = single_set (insn);
1726   if (!set)
1727     return NULL;
1728
1729   dest = SET_DEST (set);
1730   src = SET_SRC (set);
1731   code = GET_CODE (src);
1732
1733   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1734     return NULL;
1735
1736   if (FLOAT_MODE_P (GET_MODE (dest)))
1737     {
1738       if (!flag_associative_math)
1739         return NULL;
1740       /* In the case of FMA, we're also changing the rounding.  */
1741       if (code == FMA && !flag_unsafe_math_optimizations)
1742         return NULL;
1743     }
1744
1745   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1746      in MD.  But if there is no optab to generate the insn, we can not
1747      perform the variable expansion.  This can happen if an MD provides
1748      an insn but not a named pattern to generate it, for example to avoid
1749      producing code that needs additional mode switches like for x87/mmx.
1750
1751      So we check have_insn_for which looks for an optab for the operation
1752      in SRC.  If it doesn't exist, we can't perform the expansion even
1753      though INSN is valid.  */
1754   if (!have_insn_for (code, GET_MODE (src)))
1755     return NULL;
1756
1757   if (!REG_P (dest)
1758       && !(GET_CODE (dest) == SUBREG
1759            && REG_P (SUBREG_REG (dest))))
1760     return NULL;
1761
1762   /* Find the accumulator use within the operation.  */
1763   if (code == FMA)
1764     {
1765       /* We only support accumulation via FMA in the ADD position.  */
1766       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1767         return NULL;
1768       accum_pos = 2;
1769     }
1770   else if (rtx_equal_p (dest, XEXP (src, 0)))
1771     accum_pos = 0;
1772   else if (rtx_equal_p (dest, XEXP (src, 1)))
1773     {
1774       /* The method of expansion that we are using; which includes the
1775          initialization of the expansions with zero and the summation of
1776          the expansions at the end of the computation will yield wrong
1777          results for (x = something - x) thus avoid using it in that case.  */
1778       if (code == MINUS)
1779         return NULL;
1780       accum_pos = 1;
1781     }
1782   else
1783     return NULL;
1784
1785   /* It must not otherwise be used.  */
1786   if (code == FMA)
1787     {
1788       if (rtx_referenced_p (dest, XEXP (src, 0))
1789           || rtx_referenced_p (dest, XEXP (src, 1)))
1790         return NULL;
1791     }
1792   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1793     return NULL;
1794
1795   /* It must be used in exactly one insn.  */
1796   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1797     return NULL;
1798
1799   if (dump_file)
1800     {
1801       fprintf (dump_file, "\n;; Expanding Accumulator ");
1802       print_rtl (dump_file, dest);
1803       fprintf (dump_file, "\n");
1804     }
1805
1806   if (debug_uses)
1807     /* Instead of resetting the debug insns, we could replace each
1808        debug use in the loop with the sum or product of all expanded
1809        accummulators.  Since we'll only know of all expansions at the
1810        end, we'd have to keep track of which vars_to_expand a debug
1811        insn in the loop references, take note of each copy of the
1812        debug insn during unrolling, and when it's all done, compute
1813        the sum or product of each variable and adjust the original
1814        debug insn and each copy thereof.  What a pain!  */
1815     reset_debug_uses_in_loop (loop, dest, debug_uses);
1816
1817   /* Record the accumulator to expand.  */
1818   ves = XNEW (struct var_to_expand);
1819   ves->insn = insn;
1820   ves->reg = copy_rtx (dest);
1821   ves->var_expansions.create (1);
1822   ves->next = NULL;
1823   ves->op = GET_CODE (src);
1824   ves->expansion_count = 0;
1825   ves->reuse_expansion = 0;
1826   return ves;
1827 }
1828
1829 /* Determine whether there is an induction variable in INSN that
1830    we would like to split during unrolling.
1831
1832    I.e. replace
1833
1834    i = i + 1;
1835    ...
1836    i = i + 1;
1837    ...
1838    i = i + 1;
1839    ...
1840
1841    type chains by
1842
1843    i0 = i + 1
1844    ...
1845    i = i0 + 1
1846    ...
1847    i = i0 + 2
1848    ...
1849
1850    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1851    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1852    pointer to it.  */
1853
1854 static struct iv_to_split *
1855 analyze_iv_to_split_insn (rtx insn)
1856 {
1857   rtx set, dest;
1858   struct rtx_iv iv;
1859   struct iv_to_split *ivts;
1860   bool ok;
1861
1862   /* For now we just split the basic induction variables.  Later this may be
1863      extended for example by selecting also addresses of memory references.  */
1864   set = single_set (insn);
1865   if (!set)
1866     return NULL;
1867
1868   dest = SET_DEST (set);
1869   if (!REG_P (dest))
1870     return NULL;
1871
1872   if (!biv_p (insn, dest))
1873     return NULL;
1874
1875   ok = iv_analyze_result (insn, dest, &iv);
1876
1877   /* This used to be an assert under the assumption that if biv_p returns
1878      true that iv_analyze_result must also return true.  However, that
1879      assumption is not strictly correct as evidenced by pr25569.
1880
1881      Returning NULL when iv_analyze_result returns false is safe and
1882      avoids the problems in pr25569 until the iv_analyze_* routines
1883      can be fixed, which is apparently hard and time consuming
1884      according to their author.  */
1885   if (! ok)
1886     return NULL;
1887
1888   if (iv.step == const0_rtx
1889       || iv.mode != iv.extend_mode)
1890     return NULL;
1891
1892   /* Record the insn to split.  */
1893   ivts = XNEW (struct iv_to_split);
1894   ivts->insn = insn;
1895   ivts->orig_var = dest;
1896   ivts->base_var = NULL_RTX;
1897   ivts->step = iv.step;
1898   ivts->next = NULL;
1899   ivts->n_loc = 1;
1900   ivts->loc[0] = 1;
1901
1902   return ivts;
1903 }
1904
1905 /* Determines which of insns in LOOP can be optimized.
1906    Return a OPT_INFO struct with the relevant hash tables filled
1907    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1908    is undefined for the return value.  */
1909
1910 static struct opt_info *
1911 analyze_insns_in_loop (struct loop *loop)
1912 {
1913   basic_block *body, bb;
1914   unsigned i;
1915   struct opt_info *opt_info = XCNEW (struct opt_info);
1916   rtx insn;
1917   struct iv_to_split *ivts = NULL;
1918   struct var_to_expand *ves = NULL;
1919   iv_to_split **slot1;
1920   var_to_expand **slot2;
1921   vec<edge> edges = get_loop_exit_edges (loop);
1922   edge exit;
1923   bool can_apply = false;
1924
1925   iv_analysis_loop_init (loop);
1926
1927   body = get_loop_body (loop);
1928
1929   if (flag_split_ivs_in_unroller)
1930     {
1931       opt_info->insns_to_split.create (5 * loop->num_nodes);
1932       opt_info->iv_to_split_head = NULL;
1933       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1934     }
1935
1936   /* Record the loop exit bb and loop preheader before the unrolling.  */
1937   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1938
1939   if (edges.length () == 1)
1940     {
1941       exit = edges[0];
1942       if (!(exit->flags & EDGE_COMPLEX))
1943         {
1944           opt_info->loop_exit = split_edge (exit);
1945           can_apply = true;
1946         }
1947     }
1948
1949   if (flag_variable_expansion_in_unroller
1950       && can_apply)
1951     {
1952       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1953       opt_info->var_to_expand_head = NULL;
1954       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1955     }
1956
1957   for (i = 0; i < loop->num_nodes; i++)
1958     {
1959       bb = body[i];
1960       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1961         continue;
1962
1963       FOR_BB_INSNS (bb, insn)
1964       {
1965         if (!INSN_P (insn))
1966           continue;
1967
1968         if (opt_info->insns_to_split.is_created ())
1969           ivts = analyze_iv_to_split_insn (insn);
1970
1971         if (ivts)
1972           {
1973             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1974             gcc_assert (*slot1 == NULL);
1975             *slot1 = ivts;
1976             *opt_info->iv_to_split_tail = ivts;
1977             opt_info->iv_to_split_tail = &ivts->next;
1978             continue;
1979           }
1980
1981         if (opt_info->insns_with_var_to_expand.is_created ())
1982           ves = analyze_insn_to_expand_var (loop, insn);
1983
1984         if (ves)
1985           {
1986             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
1987             gcc_assert (*slot2 == NULL);
1988             *slot2 = ves;
1989             *opt_info->var_to_expand_tail = ves;
1990             opt_info->var_to_expand_tail = &ves->next;
1991           }
1992       }
1993     }
1994
1995   edges.release ();
1996   free (body);
1997   return opt_info;
1998 }
1999
2000 /* Called just before loop duplication.  Records start of duplicated area
2001    to OPT_INFO.  */
2002
2003 static void
2004 opt_info_start_duplication (struct opt_info *opt_info)
2005 {
2006   if (opt_info)
2007     opt_info->first_new_block = last_basic_block;
2008 }
2009
/* Return the number of iterations between the initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* If we are unrolling, initialization is done in the original loop
     body (number 0), so the copy number is the distance.  */
  if (unrolling)
    return n_copy;

  /* If we are peeling, the copy in that the initialization occurs has
     number 1.  The original loop (number 0) is the last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
2034
2035 /* Locate in EXPR the expression corresponding to the location recorded
2036    in IVTS, and return a pointer to the RTX for this location.  */
2037
2038 static rtx *
2039 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2040 {
2041   unsigned i;
2042   rtx *ret = &expr;
2043
2044   for (i = 0; i < ivts->n_loc; i++)
2045     ret = &XEXP (*ret, ivts->loc[i]);
2046
2047   return ret;
2048 }
2049
2050 /* Allocate basic variable for the induction variable chain.  */
2051
2052 static void
2053 allocate_basic_variable (struct iv_to_split *ivts)
2054 {
2055   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2056
2057   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2058 }
2059
2060 /* Insert initialization of basic variable of IVTS before INSN, taking
2061    the initial value from INSN.  */
2062
2063 static void
2064 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2065 {
2066   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2067   rtx seq;
2068
2069   start_sequence ();
2070   expr = force_operand (expr, ivts->base_var);
2071   if (expr != ivts->base_var)
2072     emit_move_insn (ivts->base_var, expr);
2073   seq = get_insns ();
2074   end_sequence ();
2075
2076   emit_insn_before (seq, insn);
2077 }
2078
2079 /* Replace the use of induction variable described in IVTS in INSN
2080    by base variable + DELTA * step.  */
2081
2082 static void
2083 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2084 {
2085   rtx expr, *loc, seq, incr, var;
2086   enum machine_mode mode = GET_MODE (ivts->base_var);
2087   rtx src, dest, set;
2088
2089   /* Construct base + DELTA * step.  */
2090   if (!delta)
2091     expr = ivts->base_var;
2092   else
2093     {
2094       incr = simplify_gen_binary (MULT, mode,
2095                                   ivts->step, gen_int_mode (delta, mode));
2096       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2097                                   ivts->base_var, incr);
2098     }
2099
2100   /* Figure out where to do the replacement.  */
2101   loc = get_ivts_expr (single_set (insn), ivts);
2102
2103   /* If we can make the replacement right away, we're done.  */
2104   if (validate_change (insn, loc, expr, 0))
2105     return;
2106
2107   /* Otherwise, force EXPR into a register and try again.  */
2108   start_sequence ();
2109   var = gen_reg_rtx (mode);
2110   expr = force_operand (expr, var);
2111   if (expr != var)
2112     emit_move_insn (var, expr);
2113   seq = get_insns ();
2114   end_sequence ();
2115   emit_insn_before (seq, insn);
2116
2117   if (validate_change (insn, loc, var, 0))
2118     return;
2119
2120   /* The last chance.  Try recreating the assignment in insn
2121      completely from scratch.  */
2122   set = single_set (insn);
2123   gcc_assert (set);
2124
2125   start_sequence ();
2126   *loc = var;
2127   src = copy_rtx (SET_SRC (set));
2128   dest = copy_rtx (SET_DEST (set));
2129   src = force_operand (src, dest);
2130   if (src != dest)
2131     emit_move_insn (dest, src);
2132   seq = get_insns ();
2133   end_sequence ();
2134
2135   emit_insn_before (seq, insn);
2136   delete_insn (insn);
2137 }
2138
2139
2140 /* Return one expansion of the accumulator recorded in struct VE.  */
2141
2142 static rtx
2143 get_expansion (struct var_to_expand *ve)
2144 {
2145   rtx reg;
2146
2147   if (ve->reuse_expansion == 0)
2148     reg = ve->reg;
2149   else
2150     reg = ve->var_expansions[ve->reuse_expansion - 1];
2151
2152   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2153     ve->reuse_expansion = 0;
2154   else
2155     ve->reuse_expansion++;
2156
2157   return reg;
2158 }
2159
2160
2161 /* Given INSN replace the uses of the accumulator recorded in VE
2162    with a new register.  */
2163
2164 static void
2165 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2166 {
2167   rtx new_reg, set;
2168   bool really_new_expansion = false;
2169
2170   set = single_set (insn);
2171   gcc_assert (set);
2172
2173   /* Generate a new register only if the expansion limit has not been
2174      reached.  Else reuse an already existing expansion.  */
2175   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2176     {
2177       really_new_expansion = true;
2178       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2179     }
2180   else
2181     new_reg = get_expansion (ve);
2182
2183   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2184   if (apply_change_group ())
2185     if (really_new_expansion)
2186       {
2187         ve->var_expansions.safe_push (new_reg);
2188         ve->expansion_count++;
2189       }
2190 }
2191
2192 /* Initialize the variable expansions in loop preheader.  PLACE is the
2193    loop-preheader basic block where the initialization of the
2194    expansions should take place.  The expansions are initialized with
2195    (-0) when the operation is plus or minus to honor sign zero.  This
2196    way we can prevent cases where the sign of the final result is
2197    effected by the sign of the expansion.  Here is an example to
2198    demonstrate this:
2199
2200    for (i = 0 ; i < n; i++)
2201      sum += something;
2202
2203    ==>
2204
2205    sum += something
2206    ....
2207    i = i+1;
2208    sum1 += something
2209    ....
2210    i = i+1
2211    sum2 += something;
2212    ....
2213
2214    When SUM is initialized with -zero and SOMETHING is also -zero; the
2215    final result of sum should be -zero thus the expansions sum1 and sum2
2216    should be initialized with -zero as well (otherwise we will get +zero
2217    as the final result).  */
2218
2219 static void
2220 insert_var_expansion_initialization (struct var_to_expand *ve,
2221                                      basic_block place)
2222 {
2223   rtx seq, var, zero_init;
2224   unsigned i;
2225   enum machine_mode mode = GET_MODE (ve->reg);
2226   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2227
2228   if (ve->var_expansions.length () == 0)
2229     return;
2230
2231   start_sequence ();
2232   switch (ve->op)
2233     {
2234     case FMA:
2235       /* Note that we only accumulate FMA via the ADD operand.  */
2236     case PLUS:
2237     case MINUS:
2238       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2239         {
2240           if (honor_signed_zero_p)
2241             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2242           else
2243             zero_init = CONST0_RTX (mode);
2244           emit_move_insn (var, zero_init);
2245         }
2246       break;
2247
2248     case MULT:
2249       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2250         {
2251           zero_init = CONST1_RTX (GET_MODE (var));
2252           emit_move_insn (var, zero_init);
2253         }
2254       break;
2255
2256     default:
2257       gcc_unreachable ();
2258     }
2259
2260   seq = get_insns ();
2261   end_sequence ();
2262
2263   emit_insn_after (seq, BB_END (place));
2264 }
2265
2266 /* Combine the variable expansions at the loop exit.  PLACE is the
2267    loop exit basic block where the summation of the expansions should
2268    take place.  */
2269
2270 static void
2271 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2272 {
2273   rtx sum = ve->reg;
2274   rtx expr, seq, var, insn;
2275   unsigned i;
2276
2277   if (ve->var_expansions.length () == 0)
2278     return;
2279
2280   start_sequence ();
2281   switch (ve->op)
2282     {
2283     case FMA:
2284       /* Note that we only accumulate FMA via the ADD operand.  */
2285     case PLUS:
2286     case MINUS:
2287       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2288         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2289       break;
2290
2291     case MULT:
2292       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2293         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2294       break;
2295
2296     default:
2297       gcc_unreachable ();
2298     }
2299
2300   expr = force_operand (sum, ve->reg);
2301   if (expr != ve->reg)
2302     emit_move_insn (ve->reg, expr);
2303   seq = get_insns ();
2304   end_sequence ();
2305
2306   insn = BB_HEAD (place);
2307   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2308     insn = NEXT_INSN (insn);
2309
2310   emit_insn_after (seq, insn);
2311 }
2312
2313 /* Strip away REG_EQUAL notes for IVs we're splitting.
2314
2315    Updating REG_EQUAL notes for IVs we split is tricky: We
2316    cannot tell until after unrolling, DF-rescanning, and liveness
2317    updating, whether an EQ_USE is reached by the split IV while
2318    the IV reg is still live.  See PR55006.
2319
2320    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2321    because RTL loop-iv requires us to defer rescanning insns and
2322    any notes attached to them.  So resort to old techniques...  */
2323
2324 static void
2325 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2326 {
2327   struct iv_to_split *ivts;
2328   rtx note = find_reg_equal_equiv_note (insn);
2329   if (! note)
2330     return;
2331   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2332     if (reg_mentioned_p (ivts->orig_var, note))
2333       {
2334         remove_note (insn, note);
2335         return;
2336       }
2337 }
2338
2339 /* Apply loop optimizations in loop copies using the
2340    data which gathered during the unrolling.  Structure
2341    OPT_INFO record that data.
2342
2343    UNROLLING is true if we unrolled (not peeled) the loop.
2344    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2345    the loop (as it should happen in complete unrolling, but not in ordinary
2346    peeling of the loop).  */
2347
2348 static void
2349 apply_opt_in_copies (struct opt_info *opt_info,
2350                      unsigned n_copies, bool unrolling,
2351                      bool rewrite_original_loop)
2352 {
2353   unsigned i, delta;
2354   basic_block bb, orig_bb;
2355   rtx insn, orig_insn, next;
2356   struct iv_to_split ivts_templ, *ivts;
2357   struct var_to_expand ve_templ, *ves;
2358
2359   /* Sanity check -- we need to put initialization in the original loop
2360      body.  */
2361   gcc_assert (!unrolling || rewrite_original_loop);
2362
2363   /* Allocate the basic variables (i0).  */
2364   if (opt_info->insns_to_split.is_created ())
2365     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2366       allocate_basic_variable (ivts);
2367
2368   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2369     {
2370       bb = BASIC_BLOCK (i);
2371       orig_bb = get_bb_original (bb);
2372
2373       /* bb->aux holds position in copy sequence initialized by
2374          duplicate_loop_to_header_edge.  */
2375       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2376                                         unrolling);
2377       bb->aux = 0;
2378       orig_insn = BB_HEAD (orig_bb);
2379       FOR_BB_INSNS_SAFE (bb, insn, next)
2380         {
2381           if (!INSN_P (insn)
2382               || (DEBUG_INSN_P (insn)
2383                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2384             continue;
2385
2386           while (!INSN_P (orig_insn)
2387                  || (DEBUG_INSN_P (orig_insn)
2388                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2389                          == LABEL_DECL)))
2390             orig_insn = NEXT_INSN (orig_insn);
2391
2392           ivts_templ.insn = orig_insn;
2393           ve_templ.insn = orig_insn;
2394
2395           /* Apply splitting iv optimization.  */
2396           if (opt_info->insns_to_split.is_created ())
2397             {
2398               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2399
2400               ivts = opt_info->insns_to_split.find (&ivts_templ);
2401
2402               if (ivts)
2403                 {
2404                   gcc_assert (GET_CODE (PATTERN (insn))
2405                               == GET_CODE (PATTERN (orig_insn)));
2406
2407                   if (!delta)
2408                     insert_base_initialization (ivts, insn);
2409                   split_iv (ivts, insn, delta);
2410                 }
2411             }
2412           /* Apply variable expansion optimization.  */
2413           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2414             {
2415               ves = (struct var_to_expand *)
2416                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2417               if (ves)
2418                 {
2419                   gcc_assert (GET_CODE (PATTERN (insn))
2420                               == GET_CODE (PATTERN (orig_insn)));
2421                   expand_var_during_unrolling (ves, insn);
2422                 }
2423             }
2424           orig_insn = NEXT_INSN (orig_insn);
2425         }
2426     }
2427
2428   if (!rewrite_original_loop)
2429     return;
2430
2431   /* Initialize the variable expansions in the loop preheader
2432      and take care of combining them at the loop exit.  */
2433   if (opt_info->insns_with_var_to_expand.is_created ())
2434     {
2435       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2436         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2437       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2438         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2439     }
2440
2441   /* Rewrite also the original loop body.  Find them as originals of the blocks
2442      in the last copied iteration, i.e. those that have
2443      get_bb_copy (get_bb_original (bb)) == bb.  */
2444   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2445     {
2446       bb = BASIC_BLOCK (i);
2447       orig_bb = get_bb_original (bb);
2448       if (get_bb_copy (orig_bb) != bb)
2449         continue;
2450
2451       delta = determine_split_iv_delta (0, n_copies, unrolling);
2452       for (orig_insn = BB_HEAD (orig_bb);
2453            orig_insn != NEXT_INSN (BB_END (bb));
2454            orig_insn = next)
2455         {
2456           next = NEXT_INSN (orig_insn);
2457
2458           if (!INSN_P (orig_insn))
2459             continue;
2460
2461           ivts_templ.insn = orig_insn;
2462           if (opt_info->insns_to_split.is_created ())
2463             {
2464               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2465
2466               ivts = (struct iv_to_split *)
2467                 opt_info->insns_to_split.find (&ivts_templ);
2468               if (ivts)
2469                 {
2470                   if (!delta)
2471                     insert_base_initialization (ivts, orig_insn);
2472                   split_iv (ivts, orig_insn, delta);
2473                   continue;
2474                 }
2475             }
2476
2477         }
2478     }
2479 }
2480
2481 /* Release OPT_INFO.  */
2482
2483 static void
2484 free_opt_info (struct opt_info *opt_info)
2485 {
2486   if (opt_info->insns_to_split.is_created ())
2487     opt_info->insns_to_split.dispose ();
2488   if (opt_info->insns_with_var_to_expand.is_created ())
2489     {
2490       struct var_to_expand *ves;
2491
2492       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2493         ves->var_expansions.release ();
2494       opt_info->insns_with_var_to_expand.dispose ();
2495     }
2496   free (opt_info);
2497 }