This patch rewrites the old VEC macro-based interface into a new one based on the...
[gcc.git] / gcc / loop-unroll.c
1 /* Loop unrolling and peeling.
2 Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
3 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "basic-block.h"
29 #include "cfgloop.h"
30 #include "params.h"
31 #include "expr.h"
32 #include "hashtab.h"
33 #include "recog.h"
34 #include "target.h"
35 #include "dumpfile.h"
36
 37 /* This pass performs loop unrolling and peeling.  We only perform these
 38    optimizations on innermost loops (with a single exception) because
 39    the impact on performance is greatest there, and we want to avoid
 40    unnecessary code size growth.  The gain comes from greater sequentiality
 41    of code, better opportunities for further optimization passes, and in
 42    some cases from fewer tests of exit conditions.  The main problem is
 43    code growth, which hurts performance through its effect on caches.
44
45 What we do:
46
 47    -- complete peeling of once-rolling loops; this is the above-mentioned
 48       exception, as it cancels the loop completely and
 49       does not cause code growth
 50    -- complete peeling of loops that roll a (small) constant number of times.
 51    -- simple peeling of the first iterations of loops that do not roll much
 52       (according to profile feedback)
 53    -- unrolling of loops that roll a constant number of times; this is almost
 54       always a win, as we get rid of the exit condition tests.
 55    -- unrolling of loops whose number of iterations can be computed at
 56       runtime; we also get rid of exit condition tests here, but there
 57       is the extra expense of calculating the number of iterations
 58    -- simple unrolling of the remaining loops; this is performed only if we
 59       are asked to, as the gain is questionable in this case and often
 60       it may even slow down the code
 61    For more detailed descriptions of each of those, see the comments at the
 62    appropriate functions below.
63
 64    There are many parameters (defined and described in params.def) that
 65    control how much we unroll/peel.
66
 67    ??? A big problem is that we do not have a good way to determine
 68    how many times we should unroll the loop; the experiments I have
 69    made showed that this choice may affect performance on the order
 70    of several percent.  */
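
/* For reference (all of these appear below in this file): the parameters
   consulted include PARAM_MAX_UNROLLED_INSNS, PARAM_MAX_AVERAGE_UNROLLED_INSNS,
   PARAM_MAX_UNROLL_TIMES, PARAM_MAX_PEELED_INSNS, PARAM_MAX_PEEL_TIMES,
   PARAM_MAX_ONCE_PEELED_INSNS, PARAM_MAX_COMPLETELY_PEELED_INSNS and
   PARAM_MAX_COMPLETELY_PEEL_TIMES, used by the decide_* routines.  */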
71
72 /* Information about induction variables to split. */
73
74 struct iv_to_split
75 {
 76   rtx insn;		/* The insn in which the induction variable occurs.  */
 77   rtx base_var;		/* The variable on which the values in the further
 78 			   iterations are based.  */
79 rtx step; /* Step of the induction variable. */
80 struct iv_to_split *next; /* Next entry in walking order. */
81 unsigned n_loc;
82 unsigned loc[3]; /* Location where the definition of the induction
83 variable occurs in the insn. For example if
84 N_LOC is 2, the expression is located at
85 XEXP (XEXP (single_set, loc[0]), loc[1]). */
86 };
87
88 /* Information about accumulators to expand. */
89
90 struct var_to_expand
91 {
 92   rtx insn;		           /* The insn in which the variable expansion occurs.  */
93 rtx reg; /* The accumulator which is expanded. */
94 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
95 struct var_to_expand *next; /* Next entry in walking order. */
96 enum rtx_code op; /* The type of the accumulation - addition, subtraction
97 or multiplication. */
98 int expansion_count; /* Count the number of expansions generated so far. */
99 int reuse_expansion; /* The expansion we intend to reuse to expand
100 the accumulator. If REUSE_EXPANSION is 0 reuse
101 the original accumulator. Else use
102 var_expansions[REUSE_EXPANSION - 1]. */
103 };
104
 105 /* Information about optimizations applied in
106 the unrolled loop. */
107
108 struct opt_info
109 {
110 htab_t insns_to_split; /* A hashtable of insns to split. */
111 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
112 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
113 htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
114 to expand. */
115 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
116 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
117 unsigned first_new_block; /* The first basic block that was
118 duplicated. */
119 basic_block loop_exit; /* The loop exit basic block. */
120 basic_block loop_preheader; /* The loop preheader basic block. */
121 };
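
/* The hash tables above provide lookup by insn, while the head/tail
   lists preserve the order in which the entries were recorded (the
   "walking order" mentioned in the structures above), so the recorded
   optimizations can later be walked in a stable order.  */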
122
123 static void decide_unrolling_and_peeling (int);
124 static void peel_loops_completely (int);
125 static void decide_peel_simple (struct loop *, int);
126 static void decide_peel_once_rolling (struct loop *, int);
127 static void decide_peel_completely (struct loop *, int);
128 static void decide_unroll_stupid (struct loop *, int);
129 static void decide_unroll_constant_iterations (struct loop *, int);
130 static void decide_unroll_runtime_iterations (struct loop *, int);
131 static void peel_loop_simple (struct loop *);
132 static void peel_loop_completely (struct loop *);
133 static void unroll_loop_stupid (struct loop *);
134 static void unroll_loop_constant_iterations (struct loop *);
135 static void unroll_loop_runtime_iterations (struct loop *);
136 static struct opt_info *analyze_insns_in_loop (struct loop *);
137 static void opt_info_start_duplication (struct opt_info *);
138 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
139 static void free_opt_info (struct opt_info *);
140 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
141 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
142 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
143 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
144 static void insert_var_expansion_initialization (struct var_to_expand *,
145 basic_block);
146 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
147 basic_block);
148 static rtx get_expansion (struct var_to_expand *);
149
150 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
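/* FLAGS is a combination of the UAP_PEEL, UAP_UNROLL and UAP_UNROLL_ALL
   bits; each decide_* routine below checks the bit it cares about.  */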
151 void
152 unroll_and_peel_loops (int flags)
153 {
154 struct loop *loop;
155 bool check;
156 loop_iterator li;
157
158 /* First perform complete loop peeling (it is almost surely a win,
 159      and it strongly affects the parameters for further decisions).  */
160 peel_loops_completely (flags);
161
162 /* Now decide rest of unrolling and peeling. */
163 decide_unrolling_and_peeling (flags);
164
165 /* Scan the loops, inner ones first. */
166 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
167 {
168 check = true;
169 /* And perform the appropriate transformations. */
170 switch (loop->lpt_decision.decision)
171 {
172 case LPT_PEEL_COMPLETELY:
173 /* Already done. */
174 gcc_unreachable ();
175 case LPT_PEEL_SIMPLE:
176 peel_loop_simple (loop);
177 break;
178 case LPT_UNROLL_CONSTANT:
179 unroll_loop_constant_iterations (loop);
180 break;
181 case LPT_UNROLL_RUNTIME:
182 unroll_loop_runtime_iterations (loop);
183 break;
184 case LPT_UNROLL_STUPID:
185 unroll_loop_stupid (loop);
186 break;
187 case LPT_NONE:
188 check = false;
189 break;
190 default:
191 gcc_unreachable ();
192 }
193 if (check)
194 {
195 #ifdef ENABLE_CHECKING
196 verify_loop_structure ();
197 #endif
198 }
199 }
200
201 iv_analysis_done ();
202 }
203
 204 /* Check whether the exit of LOOP is at the end of the loop body.  */
205
206 static bool
207 loop_exit_at_end_p (struct loop *loop)
208 {
209 struct niter_desc *desc = get_simple_loop_desc (loop);
210 rtx insn;
211
212 if (desc->in_edge->dest != loop->latch)
213 return false;
214
215 /* Check that the latch is empty. */
216 FOR_BB_INSNS (loop->latch, insn)
217 {
218 if (NONDEBUG_INSN_P (insn))
219 return false;
220 }
221
222 return true;
223 }
224
225 /* Depending on FLAGS, check whether to peel loops completely and do so. */
226 static void
227 peel_loops_completely (int flags)
228 {
229 struct loop *loop;
230 loop_iterator li;
231
232 /* Scan the loops, the inner ones first. */
233 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
234 {
235 loop->lpt_decision.decision = LPT_NONE;
236
237 if (dump_file)
238 fprintf (dump_file,
239 "\n;; *** Considering loop %d for complete peeling ***\n",
240 loop->num);
241
242 loop->ninsns = num_loop_insns (loop);
243
244 decide_peel_once_rolling (loop, flags);
245 if (loop->lpt_decision.decision == LPT_NONE)
246 decide_peel_completely (loop, flags);
247
248 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
249 {
250 peel_loop_completely (loop);
251 #ifdef ENABLE_CHECKING
252 verify_loop_structure ();
253 #endif
254 }
255 }
256 }
257
 258 /* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
259 static void
260 decide_unrolling_and_peeling (int flags)
261 {
262 struct loop *loop;
263 loop_iterator li;
264
265 /* Scan the loops, inner ones first. */
266 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
267 {
268 loop->lpt_decision.decision = LPT_NONE;
269
270 if (dump_file)
271 fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
272
273 /* Do not peel cold areas. */
274 if (optimize_loop_for_size_p (loop))
275 {
276 if (dump_file)
277 fprintf (dump_file, ";; Not considering loop, cold area\n");
278 continue;
279 }
280
281 /* Can the loop be manipulated? */
282 if (!can_duplicate_loop_p (loop))
283 {
284 if (dump_file)
285 fprintf (dump_file,
286 ";; Not considering loop, cannot duplicate\n");
287 continue;
288 }
289
290 /* Skip non-innermost loops. */
291 if (loop->inner)
292 {
293 if (dump_file)
294 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
295 continue;
296 }
297
298 loop->ninsns = num_loop_insns (loop);
299 loop->av_ninsns = average_num_loop_insns (loop);
300
301 /* Try transformations one by one in decreasing order of
302 priority. */
303
304 decide_unroll_constant_iterations (loop, flags);
305 if (loop->lpt_decision.decision == LPT_NONE)
306 decide_unroll_runtime_iterations (loop, flags);
307 if (loop->lpt_decision.decision == LPT_NONE)
308 decide_unroll_stupid (loop, flags);
309 if (loop->lpt_decision.decision == LPT_NONE)
310 decide_peel_simple (loop, flags);
311 }
312 }
313
314 /* Decide whether the LOOP is once rolling and suitable for complete
315 peeling. */
316 static void
317 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
318 {
319 struct niter_desc *desc;
320
321 if (dump_file)
322 fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
323
324 /* Is the loop small enough? */
325 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
326 {
327 if (dump_file)
328 fprintf (dump_file, ";; Not considering loop, is too big\n");
329 return;
330 }
331
332 /* Check for simple loops. */
333 desc = get_simple_loop_desc (loop);
334
335 /* Check number of iterations. */
336 if (!desc->simple_p
337 || desc->assumptions
338 || desc->infinite
339 || !desc->const_iter
340 || (desc->niter != 0
341 && max_loop_iterations_int (loop) != 0))
342 {
343 if (dump_file)
344 fprintf (dump_file,
345 ";; Unable to prove that the loop rolls exactly once\n");
346 return;
347 }
348
349 /* Success. */
350 if (dump_file)
351 fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
352 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
353 }
354
355 /* Decide whether the LOOP is suitable for complete peeling. */
356 static void
357 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
358 {
359 unsigned npeel;
360 struct niter_desc *desc;
361
362 if (dump_file)
363 fprintf (dump_file, "\n;; Considering peeling completely\n");
364
365 /* Skip non-innermost loops. */
366 if (loop->inner)
367 {
368 if (dump_file)
369 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
370 return;
371 }
372
373 /* Do not peel cold areas. */
374 if (optimize_loop_for_size_p (loop))
375 {
376 if (dump_file)
377 fprintf (dump_file, ";; Not considering loop, cold area\n");
378 return;
379 }
380
381 /* Can the loop be manipulated? */
382 if (!can_duplicate_loop_p (loop))
383 {
384 if (dump_file)
385 fprintf (dump_file,
386 ";; Not considering loop, cannot duplicate\n");
387 return;
388 }
389
390 /* npeel = number of iterations to peel. */
391 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
392 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
393 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
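   /* For illustration only (the parameter defaults vary between releases):
      with a 400-insn budget and a 10-insn loop body this gives npeel == 40,
      which the cap just above may then reduce.  */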
394
395 /* Is the loop small enough? */
396 if (!npeel)
397 {
398 if (dump_file)
399 fprintf (dump_file, ";; Not considering loop, is too big\n");
400 return;
401 }
402
403 /* Check for simple loops. */
404 desc = get_simple_loop_desc (loop);
405
406 /* Check number of iterations. */
407 if (!desc->simple_p
408 || desc->assumptions
409 || !desc->const_iter
410 || desc->infinite)
411 {
412 if (dump_file)
413 fprintf (dump_file,
414 ";; Unable to prove that the loop iterates constant times\n");
415 return;
416 }
417
418 if (desc->niter > npeel - 1)
419 {
420 if (dump_file)
421 {
422 fprintf (dump_file,
423 ";; Not peeling loop completely, rolls too much (");
424 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
425 fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
426 }
427 return;
428 }
429
430 /* Success. */
431 if (dump_file)
432 fprintf (dump_file, ";; Decided to peel loop completely\n");
433 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
434 }
435
436 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 437    completely.  The transformation does this:
438
439 for (i = 0; i < 4; i++)
440 body;
441
442 ==>
443
444 i = 0;
445 body; i++;
446 body; i++;
447 body; i++;
448 body; i++;
449 */
450 static void
451 peel_loop_completely (struct loop *loop)
452 {
453 sbitmap wont_exit;
454 unsigned HOST_WIDE_INT npeel;
455 unsigned i;
456 vec<edge> remove_edges;
457 edge ein;
458 struct niter_desc *desc = get_simple_loop_desc (loop);
459 struct opt_info *opt_info = NULL;
460
461 npeel = desc->niter;
462
463 if (npeel)
464 {
465 bool ok;
466
467 wont_exit = sbitmap_alloc (npeel + 1);
468 bitmap_ones (wont_exit);
469 bitmap_clear_bit (wont_exit, 0);
470 if (desc->noloop_assumptions)
471 bitmap_clear_bit (wont_exit, 1);
472
473 remove_edges.create (0);
474
475 if (flag_split_ivs_in_unroller)
476 opt_info = analyze_insns_in_loop (loop);
477
478 opt_info_start_duplication (opt_info);
479 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
480 npeel,
481 wont_exit, desc->out_edge,
482 &remove_edges,
483 DLTHE_FLAG_UPDATE_FREQ
484 | DLTHE_FLAG_COMPLETTE_PEEL
485 | (opt_info
486 ? DLTHE_RECORD_COPY_NUMBER : 0));
487 gcc_assert (ok);
488
489 free (wont_exit);
490
491 if (opt_info)
492 {
493 apply_opt_in_copies (opt_info, npeel, false, true);
494 free_opt_info (opt_info);
495 }
496
497 /* Remove the exit edges. */
498 FOR_EACH_VEC_ELT (remove_edges, i, ein)
499 remove_path (ein);
500 remove_edges.release ();
501 }
502
503 ein = desc->in_edge;
504 free_simple_loop_desc (loop);
505
506 /* Now remove the unreachable part of the last iteration and cancel
507 the loop. */
508 remove_path (ein);
509
510 if (dump_file)
511 fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
512 }
513
 514 /* Decide whether to unroll LOOP iterating a constant number of times
515 and how much. */
516
517 static void
518 decide_unroll_constant_iterations (struct loop *loop, int flags)
519 {
520 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
521 struct niter_desc *desc;
522 double_int iterations;
523
524 if (!(flags & UAP_UNROLL))
525 {
 526       /* We were not asked to; just return silently.  */
527 return;
528 }
529
530 if (dump_file)
531 fprintf (dump_file,
532 "\n;; Considering unrolling loop with constant "
533 "number of iterations\n");
534
535 /* nunroll = total number of copies of the original loop body in
 536      unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
537 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
538 nunroll_by_av
539 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
540 if (nunroll > nunroll_by_av)
541 nunroll = nunroll_by_av;
542 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
543 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
544
545 /* Skip big loops. */
546 if (nunroll <= 1)
547 {
548 if (dump_file)
549 fprintf (dump_file, ";; Not considering loop, is too big\n");
550 return;
551 }
552
553 /* Check for simple loops. */
554 desc = get_simple_loop_desc (loop);
555
556 /* Check number of iterations. */
557 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
558 {
559 if (dump_file)
560 fprintf (dump_file,
561 ";; Unable to prove that the loop iterates constant times\n");
562 return;
563 }
564
565 /* Check whether the loop rolls enough to consider.
 566      Consult also the loop bounds and profile; if the loop has more
 567      than one exit it may well iterate fewer times than the determined
 568      maximal number of iterations.  */
569 if (desc->niter < 2 * nunroll
570 || ((estimated_loop_iterations (loop, &iterations)
571 || max_loop_iterations (loop, &iterations))
572 && iterations.ult (double_int::from_shwi (2 * nunroll))))
573 {
574 if (dump_file)
575 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
576 return;
577 }
578
 579   /* Success; now compute the number of iterations to unroll.  We alter
 580      nunroll so that as few copies of the loop body as possible are
 581      necessary, while still not decreasing the number of unrollings
 582      too much (at most by 1).  */
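  /* In the loop below, for illustration: with desc->niter == 102 and
     i == 3 (an unroll factor of 4), exit_mod == 102 % 4 == 2, so with
     the exit test at the start of the loop we need the 2 peeled copies
     plus the 4 in the body, i.e. n_copies == exit_mod + i + 1 == 6.  */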
583 best_copies = 2 * nunroll + 10;
584
585 i = 2 * nunroll + 2;
586 if (i - 1 >= desc->niter)
587 i = desc->niter - 2;
588
589 for (; i >= nunroll - 1; i--)
590 {
591 unsigned exit_mod = desc->niter % (i + 1);
592
593 if (!loop_exit_at_end_p (loop))
594 n_copies = exit_mod + i + 1;
595 else if (exit_mod != (unsigned) i
596 || desc->noloop_assumptions != NULL_RTX)
597 n_copies = exit_mod + i + 2;
598 else
599 n_copies = i + 1;
600
601 if (n_copies < best_copies)
602 {
603 best_copies = n_copies;
604 best_unroll = i;
605 }
606 }
607
608 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
609 loop->lpt_decision.times = best_unroll;
610
611 if (dump_file)
612 fprintf (dump_file, ";; Decided to unroll the loop %d times (%d copies).\n",
613 loop->lpt_decision.times, best_copies);
614 }
615
 616 /* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES times.
617 The transformation does this:
618
619 for (i = 0; i < 102; i++)
620 body;
621
622 ==> (LOOP->LPT_DECISION.TIMES == 3)
623
624 i = 0;
625 body; i++;
626 body; i++;
627 while (i < 102)
628 {
629 body; i++;
630 body; i++;
631 body; i++;
632 body; i++;
633 }
634 */
635 static void
636 unroll_loop_constant_iterations (struct loop *loop)
637 {
638 unsigned HOST_WIDE_INT niter;
639 unsigned exit_mod;
640 sbitmap wont_exit;
641 unsigned i;
642 vec<edge> remove_edges;
643 edge e;
644 unsigned max_unroll = loop->lpt_decision.times;
645 struct niter_desc *desc = get_simple_loop_desc (loop);
646 bool exit_at_end = loop_exit_at_end_p (loop);
647 struct opt_info *opt_info = NULL;
648 bool ok;
649
650 niter = desc->niter;
651
 652   /* Should not get here (such a loop should be peeled instead).  */
653 gcc_assert (niter > max_unroll + 1);
654
655 exit_mod = niter % (max_unroll + 1);
656
657 wont_exit = sbitmap_alloc (max_unroll + 1);
658 bitmap_ones (wont_exit);
659
660 remove_edges.create (0);
661 if (flag_split_ivs_in_unroller
662 || flag_variable_expansion_in_unroller)
663 opt_info = analyze_insns_in_loop (loop);
664
665 if (!exit_at_end)
666 {
 667       /* The exit is not at the end of the loop; leave the exit test
 668 	 in the first copy, so that loops that start with a test
 669 	 of the exit condition have a contiguous body after unrolling.  */
670
671 if (dump_file)
672 fprintf (dump_file, ";; Condition at beginning of loop.\n");
673
674 /* Peel exit_mod iterations. */
675 bitmap_clear_bit (wont_exit, 0);
676 if (desc->noloop_assumptions)
677 bitmap_clear_bit (wont_exit, 1);
678
679 if (exit_mod)
680 {
681 opt_info_start_duplication (opt_info);
682 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
683 exit_mod,
684 wont_exit, desc->out_edge,
685 &remove_edges,
686 DLTHE_FLAG_UPDATE_FREQ
687 | (opt_info && exit_mod > 1
688 ? DLTHE_RECORD_COPY_NUMBER
689 : 0));
690 gcc_assert (ok);
691
692 if (opt_info && exit_mod > 1)
693 apply_opt_in_copies (opt_info, exit_mod, false, false);
694
695 desc->noloop_assumptions = NULL_RTX;
696 desc->niter -= exit_mod;
697 loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
698 if (loop->any_estimate
699 && double_int::from_uhwi (exit_mod).ule
700 (loop->nb_iterations_estimate))
701 loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
702 else
703 loop->any_estimate = false;
704 }
705
706 bitmap_set_bit (wont_exit, 1);
707 }
708 else
709 {
 710       /* Leave the exit test in the last copy, for the same reason as above,
 711 	 if the loop tests the condition at the end of the loop body.  */
712
713 if (dump_file)
714 fprintf (dump_file, ";; Condition at end of loop.\n");
715
 716       /* We know that niter >= max_unroll + 2; so we do not need to care
 717 	 about the case where we would exit before reaching the loop.  So
 718 	 just peel exit_mod + 1 iterations.  */
719 if (exit_mod != max_unroll
720 || desc->noloop_assumptions)
721 {
722 bitmap_clear_bit (wont_exit, 0);
723 if (desc->noloop_assumptions)
724 bitmap_clear_bit (wont_exit, 1);
725
726 opt_info_start_duplication (opt_info);
727 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
728 exit_mod + 1,
729 wont_exit, desc->out_edge,
730 &remove_edges,
731 DLTHE_FLAG_UPDATE_FREQ
732 | (opt_info && exit_mod > 0
733 ? DLTHE_RECORD_COPY_NUMBER
734 : 0));
735 gcc_assert (ok);
736
737 if (opt_info && exit_mod > 0)
738 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
739
740 desc->niter -= exit_mod + 1;
741 loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
742 if (loop->any_estimate
743 && double_int::from_uhwi (exit_mod + 1).ule
744 (loop->nb_iterations_estimate))
745 loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
746 else
747 loop->any_estimate = false;
748 desc->noloop_assumptions = NULL_RTX;
749
750 bitmap_set_bit (wont_exit, 0);
751 bitmap_set_bit (wont_exit, 1);
752 }
753
754 bitmap_clear_bit (wont_exit, max_unroll);
755 }
756
757 /* Now unroll the loop. */
758
759 opt_info_start_duplication (opt_info);
760 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
761 max_unroll,
762 wont_exit, desc->out_edge,
763 &remove_edges,
764 DLTHE_FLAG_UPDATE_FREQ
765 | (opt_info
766 ? DLTHE_RECORD_COPY_NUMBER
767 : 0));
768 gcc_assert (ok);
769
770 if (opt_info)
771 {
772 apply_opt_in_copies (opt_info, max_unroll, true, true);
773 free_opt_info (opt_info);
774 }
775
776 free (wont_exit);
777
778 if (exit_at_end)
779 {
780 basic_block exit_block = get_bb_copy (desc->in_edge->src);
 781       /* Find new in and out edges; they are in the last copy we made.  */
782
783 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
784 {
785 desc->out_edge = EDGE_SUCC (exit_block, 0);
786 desc->in_edge = EDGE_SUCC (exit_block, 1);
787 }
788 else
789 {
790 desc->out_edge = EDGE_SUCC (exit_block, 1);
791 desc->in_edge = EDGE_SUCC (exit_block, 0);
792 }
793 }
794
795 desc->niter /= max_unroll + 1;
796 loop->nb_iterations_upper_bound
797 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
798 + 1),
799 TRUNC_DIV_EXPR);
800 if (loop->any_estimate)
801 loop->nb_iterations_estimate
802 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
803 + 1),
804 TRUNC_DIV_EXPR);
805 desc->niter_expr = GEN_INT (desc->niter);
806
807 /* Remove the edges. */
808 FOR_EACH_VEC_ELT (remove_edges, i, e)
809 remove_path (e);
810 remove_edges.release ();
811
812 if (dump_file)
813 fprintf (dump_file,
814 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
815 max_unroll, num_loop_insns (loop));
816 }
817
 818 /* Decide whether to unroll LOOP iterating a runtime-computable number of times
819 and how much. */
820 static void
821 decide_unroll_runtime_iterations (struct loop *loop, int flags)
822 {
823 unsigned nunroll, nunroll_by_av, i;
824 struct niter_desc *desc;
825 double_int iterations;
826
827 if (!(flags & UAP_UNROLL))
828 {
 829       /* We were not asked to; just return silently.  */
830 return;
831 }
832
833 if (dump_file)
834 fprintf (dump_file,
835 "\n;; Considering unrolling loop with runtime "
836 "computable number of iterations\n");
837
838 /* nunroll = total number of copies of the original loop body in
 839      unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
840 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
841 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
842 if (nunroll > nunroll_by_av)
843 nunroll = nunroll_by_av;
844 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
845 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
846
847 if (targetm.loop_unroll_adjust)
848 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
849
850 /* Skip big loops. */
851 if (nunroll <= 1)
852 {
853 if (dump_file)
854 fprintf (dump_file, ";; Not considering loop, is too big\n");
855 return;
856 }
857
858 /* Check for simple loops. */
859 desc = get_simple_loop_desc (loop);
860
861 /* Check simpleness. */
862 if (!desc->simple_p || desc->assumptions)
863 {
864 if (dump_file)
865 fprintf (dump_file,
866 ";; Unable to prove that the number of iterations "
867 "can be counted in runtime\n");
868 return;
869 }
870
871 if (desc->const_iter)
872 {
873 if (dump_file)
874 fprintf (dump_file, ";; Loop iterates constant times\n");
875 return;
876 }
877
878 /* Check whether the loop rolls. */
879 if ((estimated_loop_iterations (loop, &iterations)
880 || max_loop_iterations (loop, &iterations))
881 && iterations.ult (double_int::from_shwi (2 * nunroll)))
882 {
883 if (dump_file)
884 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
885 return;
886 }
887
 888   /* Success; now force nunroll to be a power of 2, as we are unable to
 889      cope with overflows in the computation of the number of iterations.  */
890 for (i = 1; 2 * i <= nunroll; i *= 2)
891 continue;
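  /* E.g. nunroll == 6 leaves i == 4 here, so lpt_decision.times will be
     3 below, i.e. an unroll factor of 4.  */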
892
893 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
894 loop->lpt_decision.times = i - 1;
895
896 if (dump_file)
897 fprintf (dump_file, ";; Decided to unroll the loop %d times.\n",
898 loop->lpt_decision.times);
899 }
900
901 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
902 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
903 and NULL is returned instead. */
904
905 basic_block
906 split_edge_and_insert (edge e, rtx insns)
907 {
908 basic_block bb;
909
910 if (!insns)
911 return NULL;
912 bb = split_edge (e);
913 emit_insn_after (insns, BB_END (bb));
914
915 /* ??? We used to assume that INSNS can contain control flow insns, and
916 that we had to try to find sub basic blocks in BB to maintain a valid
917 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
918 and call break_superblocks when going out of cfglayout mode. But it
919 turns out that this never happens; and that if it does ever happen,
920 the TODO_verify_flow at the end of the RTL loop passes would fail.
921
922 There are two reasons why we expected we could have control flow insns
923 in INSNS. The first is when a comparison has to be done in parts, and
924 the second is when the number of iterations is computed for loops with
925 the number of iterations known at runtime. In both cases, test cases
926 to get control flow in INSNS appear to be impossible to construct:
927
928 * If do_compare_rtx_and_jump needs several branches to do comparison
929 in a mode that needs comparison by parts, we cannot analyze the
930 number of iterations of the loop, and we never get to unrolling it.
931
932 * The code in expand_divmod that was suspected to cause creation of
933 branching code seems to be only accessed for signed division. The
934 divisions used by # of iterations analysis are always unsigned.
 935      Problems might arise on architectures that emit branching code
 936      for some operations that may appear in the unroller (especially
 937      for division), but we know of no such architectures.
938
939 Considering all this, it was decided that we should for now assume
940 that INSNS can in theory contain control flow insns, but in practice
941 it never does. So we don't handle the theoretical case, and should
942 a real failure ever show up, we have a pretty good clue for how to
943 fix it. */
944
945 return bb;
946 }
947
 948 /* Unroll LOOP, for which we are able to count the number of iterations at
 949    runtime, LOOP->LPT_DECISION.TIMES times.  The transformation does this (with
 950    some extra care for the case n < 0):
951
952 for (i = 0; i < n; i++)
953 body;
954
955 ==> (LOOP->LPT_DECISION.TIMES == 3)
956
957 i = 0;
958 mod = n % 4;
959
960 switch (mod)
961 {
962 case 3:
963 body; i++;
964 case 2:
965 body; i++;
966 case 1:
967 body; i++;
968 case 0: ;
969 }
970
971 while (i < n)
972 {
973 body; i++;
974 body; i++;
975 body; i++;
976 body; i++;
977 }
978 */
979 static void
980 unroll_loop_runtime_iterations (struct loop *loop)
981 {
982 rtx old_niter, niter, init_code, branch_code, tmp;
983 unsigned i, j, p;
984 basic_block preheader, *body, swtch, ezc_swtch;
985 vec<basic_block> dom_bbs;
986 sbitmap wont_exit;
987 int may_exit_copy;
988 unsigned n_peel;
989 vec<edge> remove_edges;
990 edge e;
991 bool extra_zero_check, last_may_exit;
992 unsigned max_unroll = loop->lpt_decision.times;
993 struct niter_desc *desc = get_simple_loop_desc (loop);
994 bool exit_at_end = loop_exit_at_end_p (loop);
995 struct opt_info *opt_info = NULL;
996 bool ok;
997
998 if (flag_split_ivs_in_unroller
999 || flag_variable_expansion_in_unroller)
1000 opt_info = analyze_insns_in_loop (loop);
1001
1002 /* Remember blocks whose dominators will have to be updated. */
1003 dom_bbs.create (0);
1004
1005 body = get_loop_body (loop);
1006 for (i = 0; i < loop->num_nodes; i++)
1007 {
1008 vec<basic_block> ldom;
1009 basic_block bb;
1010
1011 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1012 FOR_EACH_VEC_ELT (ldom, j, bb)
1013 if (!flow_bb_inside_loop_p (loop, bb))
1014 dom_bbs.safe_push (bb);
1015
1016 ldom.release ();
1017 }
1018 free (body);
1019
1020 if (!exit_at_end)
1021 {
 1022       /* Leave the exit in the first copy (for an explanation why, see the
 1023 	 comment in unroll_loop_constant_iterations).  */
1024 may_exit_copy = 0;
1025 n_peel = max_unroll - 1;
1026 extra_zero_check = true;
1027 last_may_exit = false;
1028 }
1029 else
1030 {
 1031       /* Leave the exit in the last copy (for an explanation why, see the
 1032 	 comment in unroll_loop_constant_iterations).  */
1033 may_exit_copy = max_unroll;
1034 n_peel = max_unroll;
1035 extra_zero_check = false;
1036 last_may_exit = true;
1037 }
1038
1039 /* Get expression for number of iterations. */
1040 start_sequence ();
1041 old_niter = niter = gen_reg_rtx (desc->mode);
1042 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1043 if (tmp != niter)
1044 emit_move_insn (niter, tmp);
1045
 1046   /* Compute the modulo by ANDing niter with max_unroll; we use the fact that
1047 the number of unrollings is a power of two, and thus this is correct
1048 even if there is overflow in the computation. */
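  /* For example, with max_unroll == 3 (an unroll factor of 4), niter & 3
     equals niter % 4, and this holds even if the computation of niter
     wrapped around in desc->mode.  */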
1049 niter = expand_simple_binop (desc->mode, AND,
1050 niter,
1051 GEN_INT (max_unroll),
1052 NULL_RTX, 0, OPTAB_LIB_WIDEN);
1053
1054 init_code = get_insns ();
1055 end_sequence ();
1056 unshare_all_rtl_in_chain (init_code);
1057
1058 /* Precondition the loop. */
1059 split_edge_and_insert (loop_preheader_edge (loop), init_code);
1060
1061 remove_edges.create (0);
1062
1063 wont_exit = sbitmap_alloc (max_unroll + 2);
1064
 1065   /* Peel the first copy of the loop body (almost always we must leave the
 1066      exit test here; the only exception is when we have an extra zero check
 1067      and the number of iterations is reliable).  Also record the place of
 1068      the (possible) extra zero check.  */
1069 bitmap_clear (wont_exit);
1070 if (extra_zero_check
1071 && !desc->noloop_assumptions)
1072 bitmap_set_bit (wont_exit, 1);
1073 ezc_swtch = loop_preheader_edge (loop)->src;
1074 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1075 1, wont_exit, desc->out_edge,
1076 &remove_edges,
1077 DLTHE_FLAG_UPDATE_FREQ);
1078 gcc_assert (ok);
1079
1080 /* Record the place where switch will be built for preconditioning. */
1081 swtch = split_edge (loop_preheader_edge (loop));
1082
1083 for (i = 0; i < n_peel; i++)
1084 {
1085 /* Peel the copy. */
1086 bitmap_clear (wont_exit);
1087 if (i != n_peel - 1 || !last_may_exit)
1088 bitmap_set_bit (wont_exit, 1);
1089 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1090 1, wont_exit, desc->out_edge,
1091 &remove_edges,
1092 DLTHE_FLAG_UPDATE_FREQ);
1093 gcc_assert (ok);
1094
1095 /* Create item for switch. */
1096 j = n_peel - i - (extra_zero_check ? 0 : 1);
1097 p = REG_BR_PROB_BASE / (i + 2);
1098
1099 preheader = split_edge (loop_preheader_edge (loop));
1100 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1101 block_label (preheader), p,
1102 NULL_RTX);
1103
1104 /* We rely on the fact that the compare and jump cannot be optimized out,
1105 and hence the cfg we create is correct. */
1106 gcc_assert (branch_code != NULL_RTX);
1107
1108 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1109 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1110 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1111 e = make_edge (swtch, preheader,
1112 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1113 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1114 e->probability = p;
1115 }
1116
1117 if (extra_zero_check)
1118 {
1119 /* Add branch for zero iterations. */
1120 p = REG_BR_PROB_BASE / (max_unroll + 1);
1121 swtch = ezc_swtch;
1122 preheader = split_edge (loop_preheader_edge (loop));
1123 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1124 block_label (preheader), p,
1125 NULL_RTX);
1126 gcc_assert (branch_code != NULL_RTX);
1127
1128 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1129 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1130 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1131 e = make_edge (swtch, preheader,
1132 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1133 e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1134 e->probability = p;
1135 }
1136
1137 /* Recount dominators for outer blocks. */
1138 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1139
1140 /* And unroll loop. */
1141
1142 bitmap_ones (wont_exit);
1143 bitmap_clear_bit (wont_exit, may_exit_copy);
1144 opt_info_start_duplication (opt_info);
1145
1146 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1147 max_unroll,
1148 wont_exit, desc->out_edge,
1149 &remove_edges,
1150 DLTHE_FLAG_UPDATE_FREQ
1151 | (opt_info
1152 ? DLTHE_RECORD_COPY_NUMBER
1153 : 0));
1154 gcc_assert (ok);
1155
1156 if (opt_info)
1157 {
1158 apply_opt_in_copies (opt_info, max_unroll, true, true);
1159 free_opt_info (opt_info);
1160 }
1161
1162 free (wont_exit);
1163
1164 if (exit_at_end)
1165 {
1166 basic_block exit_block = get_bb_copy (desc->in_edge->src);
 1167       /* Find new in and out edges; they are in the last copy we
 1168 	 made.  */
1169
1170 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1171 {
1172 desc->out_edge = EDGE_SUCC (exit_block, 0);
1173 desc->in_edge = EDGE_SUCC (exit_block, 1);
1174 }
1175 else
1176 {
1177 desc->out_edge = EDGE_SUCC (exit_block, 1);
1178 desc->in_edge = EDGE_SUCC (exit_block, 0);
1179 }
1180 }
1181
1182 /* Remove the edges. */
1183 FOR_EACH_VEC_ELT (remove_edges, i, e)
1184 remove_path (e);
1185 remove_edges.release ();
1186
1187 /* We must be careful when updating the number of iterations due to
 1188      preconditioning and the fact that the value must be valid at the entry
 1189      of the loop.  After passing through the code above, we see that
1190 the correct new number of iterations is this: */
1191 gcc_assert (!desc->const_iter);
1192 desc->niter_expr =
1193 simplify_gen_binary (UDIV, desc->mode, old_niter,
1194 GEN_INT (max_unroll + 1));
1195 loop->nb_iterations_upper_bound
1196 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1197 + 1),
1198 TRUNC_DIV_EXPR);
1199 if (loop->any_estimate)
1200 loop->nb_iterations_estimate
1201 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1202 + 1),
1203 TRUNC_DIV_EXPR);
1204 if (exit_at_end)
1205 {
1206 desc->niter_expr =
1207 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1208 desc->noloop_assumptions = NULL_RTX;
1209 --loop->nb_iterations_upper_bound;
1210 if (loop->any_estimate
1211 && loop->nb_iterations_estimate != double_int_zero)
1212 --loop->nb_iterations_estimate;
1213 else
1214 loop->any_estimate = false;
1215 }
1216
1217 if (dump_file)
1218 fprintf (dump_file,
1219 ";; Unrolled loop %d times, counting # of iterations "
1220 "in runtime, %i insns\n",
1221 max_unroll, num_loop_insns (loop));
1222
1223 dom_bbs.release ();
1224 }
1225
1226 /* Decide whether to simply peel LOOP and how much. */
1227 static void
1228 decide_peel_simple (struct loop *loop, int flags)
1229 {
1230 unsigned npeel;
1231 double_int iterations;
1232
1233 if (!(flags & UAP_PEEL))
1234 {
 1235       /* We were not asked to; just return silently.  */
1236 return;
1237 }
1238
1239 if (dump_file)
1240 fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1241
1242 /* npeel = number of iterations to peel. */
1243 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1244 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1245 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1246
1247 /* Skip big loops. */
1248 if (!npeel)
1249 {
1250 if (dump_file)
1251 fprintf (dump_file, ";; Not considering loop, is too big\n");
1252 return;
1253 }
1254
 1255   /* Do not simply peel loops with branches inside -- it increases the
 1256      number of mispredicts.
 1257      The exception is when we do have a profile and thus a good chance
 1258      of peeling the number of iterations the loop will iterate in practice.
 1259      TODO: this heuristic needs tuning; while for complete unrolling
 1260      the branch inside the loop mostly eliminates any improvements, for
 1261      peeling that is not the case.  Also, a function call inside the loop
 1262      is also a branch from the branch-prediction POV (and probably a better
 1263      reason not to unroll/peel).  */
1264 if (num_loop_branches (loop) > 1
1265 && profile_status != PROFILE_READ)
1266 {
1267 if (dump_file)
1268 fprintf (dump_file, ";; Not peeling, contains branches\n");
1269 return;
1270 }
1271
 1272   /* If we have a realistic estimate of the number of iterations, use it.  */
1273 if (estimated_loop_iterations (loop, &iterations))
1274 {
1275 if (double_int::from_shwi (npeel).ule (iterations))
1276 {
1277 if (dump_file)
1278 {
1279 fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1280 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1281 (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1282 fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1283 npeel);
1284 }
1285 return;
1286 }
1287 npeel = iterations.to_shwi () + 1;
1288 }
 1289   /* If we have a small enough bound on the iterations, we can still peel
 1290      (completely unroll).  */
1291 else if (max_loop_iterations (loop, &iterations)
1292 && iterations.ult (double_int::from_shwi (npeel)))
1293 npeel = iterations.to_shwi () + 1;
1294 else
1295 {
1296 /* For now we have no good heuristics to decide whether loop peeling
1297 will be effective, so disable it. */
1298 if (dump_file)
1299 fprintf (dump_file,
1300 ";; Not peeling loop, no evidence it will be profitable\n");
1301 return;
1302 }
1303
1304 /* Success. */
1305 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1306 loop->lpt_decision.times = npeel;
1307
1308 if (dump_file)
1309 fprintf (dump_file, ";; Decided to simply peel the loop %d times.\n",
1310 loop->lpt_decision.times);
1311 }
1312
 1313 /* Peel LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1314
1315 while (cond)
1316 body;
1317
1318 ==> (LOOP->LPT_DECISION.TIMES == 3)
1319
1320 if (!cond) goto end;
1321 body;
1322 if (!cond) goto end;
1323 body;
1324 if (!cond) goto end;
1325 body;
1326 while (cond)
1327 body;
1328 end: ;
1329 */
1330 static void
1331 peel_loop_simple (struct loop *loop)
1332 {
1333 sbitmap wont_exit;
1334 unsigned npeel = loop->lpt_decision.times;
1335 struct niter_desc *desc = get_simple_loop_desc (loop);
1336 struct opt_info *opt_info = NULL;
1337 bool ok;
1338
1339 if (flag_split_ivs_in_unroller && npeel > 1)
1340 opt_info = analyze_insns_in_loop (loop);
1341
1342 wont_exit = sbitmap_alloc (npeel + 1);
1343 bitmap_clear (wont_exit);
1344
1345 opt_info_start_duplication (opt_info);
1346
1347 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1348 npeel, wont_exit, NULL,
1349 NULL, DLTHE_FLAG_UPDATE_FREQ
1350 | (opt_info
1351 ? DLTHE_RECORD_COPY_NUMBER
1352 : 0));
1353 gcc_assert (ok);
1354
1355 free (wont_exit);
1356
1357 if (opt_info)
1358 {
1359 apply_opt_in_copies (opt_info, npeel, false, false);
1360 free_opt_info (opt_info);
1361 }
1362
1363 if (desc->simple_p)
1364 {
1365 if (desc->const_iter)
1366 {
1367 desc->niter -= npeel;
1368 desc->niter_expr = GEN_INT (desc->niter);
1369 desc->noloop_assumptions = NULL_RTX;
1370 }
1371 else
1372 {
 1373 	  /* We cannot just update niter_expr, as its value might be clobbered
 1374 	     inside the loop.  We could handle this by counting the number into a
 1375 	     temporary just like we do in runtime unrolling, but it does not
 1376 	     seem worthwhile.  */
1377 free_simple_loop_desc (loop);
1378 }
1379 }
1380 if (dump_file)
1381 fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1382 }
1383
1384 /* Decide whether to unroll LOOP stupidly and how much. */
1385 static void
1386 decide_unroll_stupid (struct loop *loop, int flags)
1387 {
1388 unsigned nunroll, nunroll_by_av, i;
1389 struct niter_desc *desc;
1390 double_int iterations;
1391
1392 if (!(flags & UAP_UNROLL_ALL))
1393 {
 1394       /* We were not asked to; just return silently.  */
1395 return;
1396 }
1397
1398 if (dump_file)
1399 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1400
1401 /* nunroll = total number of copies of the original loop body in
 1402      unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
1403 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1404 nunroll_by_av
1405 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1406 if (nunroll > nunroll_by_av)
1407 nunroll = nunroll_by_av;
1408 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1409 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1410
1411 if (targetm.loop_unroll_adjust)
1412 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1413
1414 /* Skip big loops. */
1415 if (nunroll <= 1)
1416 {
1417 if (dump_file)
1418 fprintf (dump_file, ";; Not considering loop, is too big\n");
1419 return;
1420 }
1421
1422 /* Check for simple loops. */
1423 desc = get_simple_loop_desc (loop);
1424
1425 /* Check simpleness. */
1426 if (desc->simple_p && !desc->assumptions)
1427 {
1428 if (dump_file)
1429 fprintf (dump_file, ";; The loop is simple\n");
1430 return;
1431 }
1432
 1433   /* Do not unroll loops with branches inside -- it increases the number
 1434      of mispredicts.
 1435      TODO: this heuristic needs tuning; a call inside the loop body
 1436      is also a relatively good reason not to unroll.  */
1437 if (num_loop_branches (loop) > 1)
1438 {
1439 if (dump_file)
1440 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1441 return;
1442 }
1443
1444 /* Check whether the loop rolls. */
1445 if ((estimated_loop_iterations (loop, &iterations)
1446 || max_loop_iterations (loop, &iterations))
1447 && iterations.ult (double_int::from_shwi (2 * nunroll)))
1448 {
1449 if (dump_file)
1450 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1451 return;
1452 }
1453
 1454   /* Success.  Now force nunroll to be a power of 2, as it seems that this
1455 improves results (partially because of better alignments, partially
1456 because of some dark magic). */
1457 for (i = 1; 2 * i <= nunroll; i *= 2)
1458 continue;
1459
1460 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1461 loop->lpt_decision.times = i - 1;
1462
1463 if (dump_file)
1464 fprintf (dump_file, ";; Decided to unroll the loop stupidly %d times.\n",
1465 loop->lpt_decision.times);
1466 }
1467
 1468 /* Unroll LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1469
1470 while (cond)
1471 body;
1472
1473 ==> (LOOP->LPT_DECISION.TIMES == 3)
1474
1475 while (cond)
1476 {
1477 body;
1478 if (!cond) break;
1479 body;
1480 if (!cond) break;
1481 body;
1482 if (!cond) break;
1483 body;
1484 }
1485 */
1486 static void
1487 unroll_loop_stupid (struct loop *loop)
1488 {
1489 sbitmap wont_exit;
1490 unsigned nunroll = loop->lpt_decision.times;
1491 struct niter_desc *desc = get_simple_loop_desc (loop);
1492 struct opt_info *opt_info = NULL;
1493 bool ok;
1494
1495 if (flag_split_ivs_in_unroller
1496 || flag_variable_expansion_in_unroller)
1497 opt_info = analyze_insns_in_loop (loop);
1498
1499
1500 wont_exit = sbitmap_alloc (nunroll + 1);
1501 bitmap_clear (wont_exit);
1502 opt_info_start_duplication (opt_info);
1503
1504 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1505 nunroll, wont_exit,
1506 NULL, NULL,
1507 DLTHE_FLAG_UPDATE_FREQ
1508 | (opt_info
1509 ? DLTHE_RECORD_COPY_NUMBER
1510 : 0));
1511 gcc_assert (ok);
1512
1513 if (opt_info)
1514 {
1515 apply_opt_in_copies (opt_info, nunroll, true, true);
1516 free_opt_info (opt_info);
1517 }
1518
1519 free (wont_exit);
1520
1521 if (desc->simple_p)
1522 {
 1523       /* We may indeed get here when nontrivial assumptions are needed for
 1524 	 the loop to be really simple.  We could update the counts, but the
 1525 	 problem is that we are unable to decide which exit will be taken
 1526 	 (not really true when the number of iterations is constant,
 1527 	 but no one will do anything with this information, so we do not
1528 worry about it). */
1529 desc->simple_p = false;
1530 }
1531
1532 if (dump_file)
1533 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1534 nunroll, num_loop_insns (loop));
1535 }
1536
1537 /* A hash function for information about insns to split. */
1538
1539 static hashval_t
1540 si_info_hash (const void *ivts)
1541 {
1542 return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1543 }
1544
 1545 /* An equality function for information about insns to split.  */
1546
1547 static int
1548 si_info_eq (const void *ivts1, const void *ivts2)
1549 {
1550 const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1551 const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1552
1553 return i1->insn == i2->insn;
1554 }
1555
1556 /* Return a hash for VES, which is really a "var_to_expand *". */
1557
1558 static hashval_t
1559 ve_info_hash (const void *ves)
1560 {
1561 return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1562 }
1563
1564 /* Return true if IVTS1 and IVTS2 (which are really both of type
1565 "var_to_expand *") refer to the same instruction. */
1566
1567 static int
1568 ve_info_eq (const void *ivts1, const void *ivts2)
1569 {
1570 const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1571 const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1572
1573 return i1->insn == i2->insn;
1574 }
1575
1576 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1577 Set *DEBUG_USES to the number of debug insns that reference the
1578 variable. */
1579
 1580 static bool
1581 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1582 int *debug_uses)
1583 {
1584 basic_block *body, bb;
1585 unsigned i;
1586 int count_ref = 0;
1587 rtx insn;
1588
1589 body = get_loop_body (loop);
1590 for (i = 0; i < loop->num_nodes; i++)
1591 {
1592 bb = body[i];
1593
1594 FOR_BB_INSNS (bb, insn)
1595 if (!rtx_referenced_p (reg, insn))
1596 continue;
1597 else if (DEBUG_INSN_P (insn))
1598 ++*debug_uses;
1599 else if (++count_ref > 1)
1600 break;
1601 }
1602 free (body);
1603 return (count_ref == 1);
1604 }
1605
1606 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1607
1608 static void
1609 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1610 {
1611 basic_block *body, bb;
1612 unsigned i;
1613 rtx insn;
1614
1615 body = get_loop_body (loop);
1616 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1617 {
1618 bb = body[i];
1619
1620 FOR_BB_INSNS (bb, insn)
1621 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1622 continue;
1623 else
1624 {
1625 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1626 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1627 if (!--debug_uses)
1628 break;
1629 }
1630 }
1631 free (body);
1632 }
1633
1634 /* Determine whether INSN contains an accumulator
1635 which can be expanded into separate copies,
1636 one for each copy of the LOOP body.
1637
1638 for (i = 0 ; i < n; i++)
1639 sum += a[i];
1640
1641 ==>
1642
1643 sum += a[i]
1644 ....
1645 i = i+1;
1646 sum1 += a[i]
1647 ....
1648 i = i+1
1649 sum2 += a[i];
1650 ....
1651
 1652    Return NULL if INSN contains no opportunity for expansion of an
 1653    accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
 1654    with the relevant information and return a pointer to it.
 1655 */
1656
1657 static struct var_to_expand *
1658 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1659 {
1660 rtx set, dest, src;
1661 struct var_to_expand *ves;
1662 unsigned accum_pos;
1663 enum rtx_code code;
1664 int debug_uses = 0;
1665
1666 set = single_set (insn);
1667 if (!set)
1668 return NULL;
1669
1670 dest = SET_DEST (set);
1671 src = SET_SRC (set);
1672 code = GET_CODE (src);
1673
1674 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1675 return NULL;
1676
1677 if (FLOAT_MODE_P (GET_MODE (dest)))
1678 {
1679 if (!flag_associative_math)
1680 return NULL;
1681 /* In the case of FMA, we're also changing the rounding. */
1682 if (code == FMA && !flag_unsafe_math_optimizations)
1683 return NULL;
1684 }
1685
1686 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
 1687      in MD.  But if there is no optab to generate the insn, we cannot
1688 perform the variable expansion. This can happen if an MD provides
1689 an insn but not a named pattern to generate it, for example to avoid
1690 producing code that needs additional mode switches like for x87/mmx.
1691
1692 So we check have_insn_for which looks for an optab for the operation
1693 in SRC. If it doesn't exist, we can't perform the expansion even
1694 though INSN is valid. */
1695 if (!have_insn_for (code, GET_MODE (src)))
1696 return NULL;
1697
1698 if (!REG_P (dest)
1699 && !(GET_CODE (dest) == SUBREG
1700 && REG_P (SUBREG_REG (dest))))
1701 return NULL;
1702
1703 /* Find the accumulator use within the operation. */
1704 if (code == FMA)
1705 {
1706 /* We only support accumulation via FMA in the ADD position. */
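      /* I.e. an insn like (set d (fma a b d)), where the accumulator d is
	 operand 2, is handled; forms where d is one of the multiplied
	 operands are rejected below.  */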
1707 if (!rtx_equal_p (dest, XEXP (src, 2)))
1708 return NULL;
1709 accum_pos = 2;
1710 }
1711 else if (rtx_equal_p (dest, XEXP (src, 0)))
1712 accum_pos = 0;
1713 else if (rtx_equal_p (dest, XEXP (src, 1)))
1714 {
 1715 	  /* The method of expansion that we are using, which initializes
 1716 	     the expansions with zero and sums them up at the end of the
 1717 	     computation, will yield wrong results for (x = something - x);
 1718 	     thus avoid using it in that case.  */
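	  /* A concrete illustration of the problem: for x = 5 - x starting
	     from x == 0, two iterations yield x == 0; but two
	     zero-initialized expansions each executed once would both
	     yield 5, and their sum, 10, is wrong.  */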
1719 if (code == MINUS)
1720 return NULL;
1721 accum_pos = 1;
1722 }
1723 else
1724 return NULL;
1725
1726 /* It must not otherwise be used. */
1727 if (code == FMA)
1728 {
1729 if (rtx_referenced_p (dest, XEXP (src, 0))
1730 || rtx_referenced_p (dest, XEXP (src, 1)))
1731 return NULL;
1732 }
1733 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1734 return NULL;
1735
1736 /* It must be used in exactly one insn. */
1737 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1738 return NULL;
1739
1740 if (dump_file)
1741 {
1742 fprintf (dump_file, "\n;; Expanding Accumulator ");
1743 print_rtl (dump_file, dest);
1744 fprintf (dump_file, "\n");
1745 }
1746
1747 if (debug_uses)
1748 /* Instead of resetting the debug insns, we could replace each
1749 debug use in the loop with the sum or product of all expanded
 1750        accumulators.  Since we'll only know of all expansions at the
1751 end, we'd have to keep track of which vars_to_expand a debug
1752 insn in the loop references, take note of each copy of the
1753 debug insn during unrolling, and when it's all done, compute
1754 the sum or product of each variable and adjust the original
1755 debug insn and each copy thereof. What a pain! */
1756 reset_debug_uses_in_loop (loop, dest, debug_uses);
1757
1758 /* Record the accumulator to expand. */
1759 ves = XNEW (struct var_to_expand);
1760 ves->insn = insn;
1761 ves->reg = copy_rtx (dest);
1762 ves->var_expansions.create (1);
1763 ves->next = NULL;
1764 ves->op = GET_CODE (src);
1765 ves->expansion_count = 0;
1766 ves->reuse_expansion = 0;
1767 return ves;
1768 }
1769
1770 /* Determine whether there is an induction variable in INSN that
1771 we would like to split during unrolling.
1772
1773 I.e. replace
1774
1775 i = i + 1;
1776 ...
1777 i = i + 1;
1778 ...
1779 i = i + 1;
1780 ...
1781
 1782    chains of this type by
1783
1784 i0 = i + 1
1785 ...
1786 i = i0 + 1
1787 ...
1788 i = i0 + 2
1789 ...
1790
1791 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1792 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1793 pointer to it. */
1794
1795 static struct iv_to_split *
1796 analyze_iv_to_split_insn (rtx insn)
1797 {
1798 rtx set, dest;
1799 struct rtx_iv iv;
1800 struct iv_to_split *ivts;
1801 bool ok;
1802
1803 /* For now we just split the basic induction variables. Later this may be
 1804      extended, for example, by also selecting addresses of memory references.  */
1805 set = single_set (insn);
1806 if (!set)
1807 return NULL;
1808
1809 dest = SET_DEST (set);
1810 if (!REG_P (dest))
1811 return NULL;
1812
1813 if (!biv_p (insn, dest))
1814 return NULL;
1815
1816 ok = iv_analyze_result (insn, dest, &iv);
1817
 1818   /* This used to be an assert under the assumption that if biv_p returns
 1819      true, then iv_analyze_result must also return true.  However, that
1820 assumption is not strictly correct as evidenced by pr25569.
1821
1822 Returning NULL when iv_analyze_result returns false is safe and
1823 avoids the problems in pr25569 until the iv_analyze_* routines
1824 can be fixed, which is apparently hard and time consuming
1825 according to their author. */
1826 if (! ok)
1827 return NULL;
1828
1829 if (iv.step == const0_rtx
1830 || iv.mode != iv.extend_mode)
1831 return NULL;
1832
1833 /* Record the insn to split. */
1834 ivts = XNEW (struct iv_to_split);
1835 ivts->insn = insn;
1836 ivts->base_var = NULL_RTX;
1837 ivts->step = iv.step;
1838 ivts->next = NULL;
1839 ivts->n_loc = 1;
1840 ivts->loc[0] = 1;
1841
1842 return ivts;
1843 }
1844
 1845 /* Determines which insns in LOOP can be optimized.
 1846    Return an OPT_INFO struct with the relevant hash tables filled
 1847    with all insns to be optimized.  The FIRST_NEW_BLOCK field
 1848    is undefined for the return value.  */
1849
1850 static struct opt_info *
1851 analyze_insns_in_loop (struct loop *loop)
1852 {
1853 basic_block *body, bb;
1854 unsigned i;
1855 struct opt_info *opt_info = XCNEW (struct opt_info);
1856 rtx insn;
1857 struct iv_to_split *ivts = NULL;
1858 struct var_to_expand *ves = NULL;
1859 PTR *slot1;
1860 PTR *slot2;
1861 vec<edge> edges = get_loop_exit_edges (loop);
1862 edge exit;
1863 bool can_apply = false;
1864
1865 iv_analysis_loop_init (loop);
1866
1867 body = get_loop_body (loop);
1868
1869 if (flag_split_ivs_in_unroller)
1870 {
1871 opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1872 si_info_hash, si_info_eq, free);
1873 opt_info->iv_to_split_head = NULL;
1874 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1875 }
1876
1877 /* Record the loop exit bb and loop preheader before the unrolling. */
1878 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1879
1880 if (edges.length () == 1)
1881 {
1882 exit = edges[0];
1883 if (!(exit->flags & EDGE_COMPLEX))
1884 {
1885 opt_info->loop_exit = split_edge (exit);
1886 can_apply = true;
1887 }
1888 }
1889
1890 if (flag_variable_expansion_in_unroller
1891 && can_apply)
1892 {
1893 opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1894 ve_info_hash,
1895 ve_info_eq, free);
1896 opt_info->var_to_expand_head = NULL;
1897 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1898 }
1899
1900 for (i = 0; i < loop->num_nodes; i++)
1901 {
1902 bb = body[i];
1903 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1904 continue;
1905
1906 FOR_BB_INSNS (bb, insn)
1907 {
1908 if (!INSN_P (insn))
1909 continue;
1910
1911 if (opt_info->insns_to_split)
1912 ivts = analyze_iv_to_split_insn (insn);
1913
1914 if (ivts)
1915 {
1916 slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1917 gcc_assert (*slot1 == NULL);
1918 *slot1 = ivts;
1919 *opt_info->iv_to_split_tail = ivts;
1920 opt_info->iv_to_split_tail = &ivts->next;
1921 continue;
1922 }
1923
1924 if (opt_info->insns_with_var_to_expand)
1925 ves = analyze_insn_to_expand_var (loop, insn);
1926
1927 if (ves)
1928 {
1929 slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1930 gcc_assert (*slot2 == NULL);
1931 *slot2 = ves;
1932 *opt_info->var_to_expand_tail = ves;
1933 opt_info->var_to_expand_tail = &ves->next;
1934 }
1935 }
1936 }
1937
1938 edges.release ();
1939 free (body);
1940 return opt_info;
1941 }
1942
1943 /* Called just before loop duplication.  Records the start of the
1944 duplicated area in OPT_INFO. */
1945
1946 static void
1947 opt_info_start_duplication (struct opt_info *opt_info)
1948 {
1949 if (opt_info)
1950 opt_info->first_new_block = last_basic_block;
1951 }
1952
1953 /* Determine the number of iterations between initialization of the base
1954 variable and the current copy (N_COPY). N_COPIES is the total number
1955 of newly created copies. UNROLLING is true if we are unrolling
1956 (not peeling) the loop. */
1957
1958 static unsigned
1959 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1960 {
1961 if (unrolling)
1962 {
1963 /* If we are unrolling, initialization is done in the original loop
1964 body (number 0). */
1965 return n_copy;
1966 }
1967 else
1968 {
1969 /* If we are peeling, the copy in which the initialization occurs has
1970 number 1.  The original loop (number 0) is the last. */
1971 if (n_copy)
1972 return n_copy - 1;
1973 else
1974 return n_copies;
1975 }
1976 }
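
/* A worked example, assuming N_COPIES == 3.  When unrolling, the
   original body is copy 0 and the new copies are 1, 2 and 3, so the
   deltas are simply 0, 1, 2 and 3.  When peeling, the initialization
   happens in the first peeled copy (n_copy == 1), so copies 1, 2 and 3
   get deltas 0, 1 and 2, while the original body (n_copy == 0), which
   runs last, gets delta N_COPIES == 3.  */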
1977
1978 /* Locate in EXPR the expression corresponding to the location recorded
1979 in IVTS, and return a pointer to the RTX for this location. */
1980
1981 static rtx *
1982 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1983 {
1984 unsigned i;
1985 rtx *ret = &expr;
1986
1987 for (i = 0; i < ivts->n_loc; i++)
1988 ret = &XEXP (*ret, ivts->loc[i]);
1989
1990 return ret;
1991 }
1992
1993 /* Allocate the basic variable for the induction variable chain. */
1994
1995 static void
1996 allocate_basic_variable (struct iv_to_split *ivts)
1997 {
1998 rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1999
2000 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2001 }
2002
2003 /* Insert initialization of basic variable of IVTS before INSN, taking
2004 the initial value from INSN. */
2005
2006 static void
2007 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2008 {
2009 rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2010 rtx seq;
2011
2012 start_sequence ();
2013 expr = force_operand (expr, ivts->base_var);
2014 if (expr != ivts->base_var)
2015 emit_move_insn (ivts->base_var, expr);
2016 seq = get_insns ();
2017 end_sequence ();
2018
2019 emit_insn_before (seq, insn);
2020 }
2021
2022 /* Replace the use of the induction variable described in IVTS in INSN
2023 with base variable + DELTA * step. */
2024
2025 static void
2026 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2027 {
2028 rtx expr, *loc, seq, incr, var;
2029 enum machine_mode mode = GET_MODE (ivts->base_var);
2030 rtx src, dest, set;
2031
2032 /* Construct base + DELTA * step. */
2033 if (!delta)
2034 expr = ivts->base_var;
2035 else
2036 {
2037 incr = simplify_gen_binary (MULT, mode,
2038 ivts->step, gen_int_mode (delta, mode));
2039 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2040 ivts->base_var, incr);
2041 }
2042
2043 /* Figure out where to do the replacement. */
2044 loc = get_ivts_expr (single_set (insn), ivts);
2045
2046 /* If we can make the replacement right away, we're done. */
2047 if (validate_change (insn, loc, expr, 0))
2048 return;
2049
2050 /* Otherwise, force EXPR into a register and try again. */
2051 start_sequence ();
2052 var = gen_reg_rtx (mode);
2053 expr = force_operand (expr, var);
2054 if (expr != var)
2055 emit_move_insn (var, expr);
2056 seq = get_insns ();
2057 end_sequence ();
2058 emit_insn_before (seq, insn);
2059
2060 if (validate_change (insn, loc, var, 0))
2061 return;
2062
2063 /* The last chance. Try recreating the assignment in insn
2064 completely from scratch. */
2065 set = single_set (insn);
2066 gcc_assert (set);
2067
2068 start_sequence ();
2069 *loc = var;
2070 src = copy_rtx (SET_SRC (set));
2071 dest = copy_rtx (SET_DEST (set));
2072 src = force_operand (src, dest);
2073 if (src != dest)
2074 emit_move_insn (dest, src);
2075 seq = get_insns ();
2076 end_sequence ();
2077
2078 emit_insn_before (seq, insn);
2079 delete_insn (insn);
2080 }
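
/* Continuing the i = i + 1 example from analyze_iv_to_split_insn:
   in the copy with DELTA == 2 and step 1, EXPR becomes
   (plus:SI (reg:SI i0) (const_int 2)), so the rewritten insn computes

     i = i0 + 2;

   independently of the serial i = i + 1 chain in the other copies.  */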
2081
2082
2083 /* Return one expansion of the accumulator recorded in VE. */
2084
2085 static rtx
2086 get_expansion (struct var_to_expand *ve)
2087 {
2088 rtx reg;
2089
2090 if (ve->reuse_expansion == 0)
2091 reg = ve->reg;
2092 else
2093 reg = ve->var_expansions[ve->reuse_expansion - 1];
2094
2095 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2096 ve->reuse_expansion = 0;
2097 else
2098 ve->reuse_expansion++;
2099
2100 return reg;
2101 }
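
/* For illustration: once the expansion limit has been reached and two
   expansions e1 and e2 have been recorded, successive calls return the
   registers in round-robin order

     VE->reg, e1, e2, VE->reg, e1, e2, ...

   since reuse_expansion wraps back to zero after walking the whole
   var_expansions vector.  */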
2102
2103
2104 /* Given INSN, replace the uses of the accumulator recorded in VE
2105 with a new register. */
2106
2107 static void
2108 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2109 {
2110 rtx new_reg, set;
2111 bool really_new_expansion = false;
2112
2113 set = single_set (insn);
2114 gcc_assert (set);
2115
2116 /* Generate a new register only if the expansion limit has not been
2117 reached.  Otherwise, reuse an already existing expansion. */
2118 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2119 {
2120 really_new_expansion = true;
2121 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2122 }
2123 else
2124 new_reg = get_expansion (ve);
2125
2126 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2127 if (apply_change_group ())
2128 if (really_new_expansion)
2129 {
2130 ve->var_expansions.safe_push (new_reg);
2131 ve->expansion_count++;
2132 }
2133 }
2134
2135 /* Initialize the variable expansions in the loop preheader.  PLACE is
2136 the loop-preheader basic block where the initialization of the
2137 expansions should take place.  The expansions are initialized with
2138 (-0) when the operation is plus or minus to honor signed zero.  This
2139 way we prevent cases where the sign of the final result is
2140 affected by the sign of the expansion.  Here is an example to
2141 demonstrate this:
2142
2143 for (i = 0 ; i < n; i++)
2144 sum += something;
2145
2146 ==>
2147
2148 sum += something;
2149 ....
2150 i = i + 1;
2151 sum1 += something;
2152 ....
2153 i = i + 1;
2154 sum2 += something;
2155 ....
2156
2157 When SUM is initialized with -zero and SOMETHING is also -zero, the
2158 final result of SUM should be -zero; thus the expansions sum1 and sum2
2159 should be initialized with -zero as well (otherwise we would get +zero
2160 as the final result). */
2161
2162 static void
2163 insert_var_expansion_initialization (struct var_to_expand *ve,
2164 basic_block place)
2165 {
2166 rtx seq, var, zero_init;
2167 unsigned i;
2168 enum machine_mode mode = GET_MODE (ve->reg);
2169 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2170
2171 if (ve->var_expansions.length () == 0)
2172 return;
2173
2174 start_sequence ();
2175 switch (ve->op)
2176 {
2177 case FMA:
2178 /* Note that we only accumulate FMA via the ADD operand. */
2179 case PLUS:
2180 case MINUS:
2181 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2182 {
2183 if (honor_signed_zero_p)
2184 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2185 else
2186 zero_init = CONST0_RTX (mode);
2187 emit_move_insn (var, zero_init);
2188 }
2189 break;
2190
2191 case MULT:
2192 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2193 {
2194 zero_init = CONST1_RTX (GET_MODE (var));
2195 emit_move_insn (var, zero_init);
2196 }
2197 break;
2198
2199 default:
2200 gcc_unreachable ();
2201 }
2202
2203 seq = get_insns ();
2204 end_sequence ();
2205
2206 emit_insn_after (seq, BB_END (place));
2207 }
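
/* A minimal standalone sketch of the signed-zero issue guarded against
   above (hypothetical variables, assuming IEEE 754 arithmetic):

     double sum = -0.0;
     double sum1 = 0.0;            // expansion wrongly seeded with +0
     sum += -0.0;                  // sum stays -0.0
     sum1 += -0.0;                 // but sum1 is now +0.0
     double total = sum + sum1;    // +0.0, not the expected -0.0

   Under round-to-nearest, (-0.0) + (-0.0) == -0.0 but
   (+0.0) + (-0.0) == +0.0, hence the -0 initialization when the
   operation is PLUS or MINUS and HONOR_SIGNED_ZEROS holds.  */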
2208
2209 /* Combine the variable expansions at the loop exit. PLACE is the
2210 loop exit basic block where the summation of the expansions should
2211 take place. */
2212
2213 static void
2214 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2215 {
2216 rtx sum = ve->reg;
2217 rtx expr, seq, var, insn;
2218 unsigned i;
2219
2220 if (ve->var_expansions.length () == 0)
2221 return;
2222
2223 start_sequence ();
2224 switch (ve->op)
2225 {
2226 case FMA:
2227 /* Note that we only accumulate FMA via the ADD operand. */
2228 case PLUS:
2229 case MINUS:
2230 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2231 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2232 break;
2233
2234 case MULT:
2235 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2236 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2237 break;
2238
2239 default:
2240 gcc_unreachable ();
2241 }
2242
2243 expr = force_operand (sum, ve->reg);
2244 if (expr != ve->reg)
2245 emit_move_insn (ve->reg, expr);
2246 seq = get_insns ();
2247 end_sequence ();
2248
2249 insn = BB_HEAD (place);
2250 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2251 insn = NEXT_INSN (insn);
2252
2253 emit_insn_after (seq, insn);
2254 }
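
/* Completing the running sum example: with expansions sum1 and sum2,
   the sequence emitted at the loop exit effectively performs

     sum = sum2 + (sum1 + sum);

   (or the analogous product for MULT), folding the per-copy
   accumulators back into the original register.  */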
2255
2256 /* Apply loop optimizations in the loop copies, using the
2257 data gathered during the unrolling.  The structure
2258 OPT_INFO records that data.
2259
2260 UNROLLING is true if we unrolled (not peeled) the loop.
2261 REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
2262 the loop (as happens in complete unrolling, but not in ordinary
2263 peeling of the loop). */
2264
2265 static void
2266 apply_opt_in_copies (struct opt_info *opt_info,
2267 unsigned n_copies, bool unrolling,
2268 bool rewrite_original_loop)
2269 {
2270 unsigned i, delta;
2271 basic_block bb, orig_bb;
2272 rtx insn, orig_insn, next;
2273 struct iv_to_split ivts_templ, *ivts;
2274 struct var_to_expand ve_templ, *ves;
2275
2276 /* Sanity check -- we need to put initialization in the original loop
2277 body. */
2278 gcc_assert (!unrolling || rewrite_original_loop);
2279
2280 /* Allocate the basic variables (i0). */
2281 if (opt_info->insns_to_split)
2282 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2283 allocate_basic_variable (ivts);
2284
2285 for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2286 {
2287 bb = BASIC_BLOCK (i);
2288 orig_bb = get_bb_original (bb);
2289
2290 /* bb->aux holds the position in the copy sequence, as initialized
2291 by duplicate_loop_to_header_edge. */
2292 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2293 unrolling);
2294 bb->aux = 0;
2295 orig_insn = BB_HEAD (orig_bb);
2296 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2297 {
2298 next = NEXT_INSN (insn);
2299 if (!INSN_P (insn)
2300 || (DEBUG_INSN_P (insn)
2301 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2302 continue;
2303
2304 while (!INSN_P (orig_insn)
2305 || (DEBUG_INSN_P (orig_insn)
2306 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2307 == LABEL_DECL)))
2308 orig_insn = NEXT_INSN (orig_insn);
2309
2310 ivts_templ.insn = orig_insn;
2311 ve_templ.insn = orig_insn;
2312
2313 /* Apply splitting iv optimization. */
2314 if (opt_info->insns_to_split)
2315 {
2316 ivts = (struct iv_to_split *)
2317 htab_find (opt_info->insns_to_split, &ivts_templ);
2318
2319 if (ivts)
2320 {
2321 gcc_assert (GET_CODE (PATTERN (insn))
2322 == GET_CODE (PATTERN (orig_insn)));
2323
2324 if (!delta)
2325 insert_base_initialization (ivts, insn);
2326 split_iv (ivts, insn, delta);
2327 }
2328 }
2329 /* Apply variable expansion optimization. */
2330 if (unrolling && opt_info->insns_with_var_to_expand)
2331 {
2332 ves = (struct var_to_expand *)
2333 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2334 if (ves)
2335 {
2336 gcc_assert (GET_CODE (PATTERN (insn))
2337 == GET_CODE (PATTERN (orig_insn)));
2338 expand_var_during_unrolling (ves, insn);
2339 }
2340 }
2341 orig_insn = NEXT_INSN (orig_insn);
2342 }
2343 }
2344
2345 if (!rewrite_original_loop)
2346 return;
2347
2348 /* Initialize the variable expansions in the loop preheader
2349 and take care of combining them at the loop exit. */
2350 if (opt_info->insns_with_var_to_expand)
2351 {
2352 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2353 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2354 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2355 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2356 }
2357
2358 /* Also rewrite the original loop body.  Find its blocks as the
2359 originals of the blocks in the last copied iteration, i.e. those
2360 that have get_bb_copy (get_bb_original (bb)) == bb. */
2361 for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2362 {
2363 bb = BASIC_BLOCK (i);
2364 orig_bb = get_bb_original (bb);
2365 if (get_bb_copy (orig_bb) != bb)
2366 continue;
2367
2368 delta = determine_split_iv_delta (0, n_copies, unrolling);
2369 for (orig_insn = BB_HEAD (orig_bb);
2370 orig_insn != NEXT_INSN (BB_END (bb));
2371 orig_insn = next)
2372 {
2373 next = NEXT_INSN (orig_insn);
2374
2375 if (!INSN_P (orig_insn))
2376 continue;
2377
2378 ivts_templ.insn = orig_insn;
2379 if (opt_info->insns_to_split)
2380 {
2381 ivts = (struct iv_to_split *)
2382 htab_find (opt_info->insns_to_split, &ivts_templ);
2383 if (ivts)
2384 {
2385 if (!delta)
2386 insert_base_initialization (ivts, orig_insn);
2387 split_iv (ivts, orig_insn, delta);
2388 continue;
2389 }
2390 }
2391
2392 }
2393 }
2394 }
2395
2396 /* Release OPT_INFO. */
2397
2398 static void
2399 free_opt_info (struct opt_info *opt_info)
2400 {
2401 if (opt_info->insns_to_split)
2402 htab_delete (opt_info->insns_to_split);
2403 if (opt_info->insns_with_var_to_expand)
2404 {
2405 struct var_to_expand *ves;
2406
2407 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2408 ves->var_expansions.release ();
2409 htab_delete (opt_info->insns_with_var_to_expand);
2410 }
2411 free (opt_info);
2412 }