gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hashtab.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
  64    There is a lot of parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
  67    ??? A great problem is that we don't have a good way how to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance in order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx base_var;         /* The variable on that the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103 };
 104
 105 /* Information about optimization applied in
 106    the unrolled loop.  */
 107
 108 struct opt_info
 109 {
 110   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 111   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 112   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 113   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 114                                       to expand.  */
 115   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 116   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 117   unsigned first_new_block;        /* The first basic block that was
 118                                       duplicated.  */
 119   basic_block loop_exit;           /* The loop exit basic block.  */
 120   basic_block loop_preheader;      /* The loop preheader basic block.  */
 121 };
 122
 123 static void decide_unrolling_and_peeling (int);
 124 static void peel_loops_completely (int);
 125 static void decide_peel_simple (struct loop *, int);
 126 static void decide_peel_once_rolling (struct loop *, int);
 127 static void decide_peel_completely (struct loop *, int);
 128 static void decide_unroll_stupid (struct loop *, int);
 129 static void decide_unroll_constant_iterations (struct loop *, int);
 130 static void decide_unroll_runtime_iterations (struct loop *, int);
 131 static void peel_loop_simple (struct loop *);
 132 static void peel_loop_completely (struct loop *);
 133 static void unroll_loop_stupid (struct loop *);
 134 static void unroll_loop_constant_iterations (struct loop *);
 135 static void unroll_loop_runtime_iterations (struct loop *);
 136 static struct opt_info *analyze_insns_in_loop (struct loop *);
 137 static void opt_info_start_duplication (struct opt_info *);
 138 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 139 static void free_opt_info (struct opt_info *);
 140 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 141 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 142 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 143 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 144 static void insert_var_expansion_initialization (struct var_to_expand *,
 145                                                  basic_block);
 146 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 147                                              basic_block);
 148 static rtx get_expansion (struct var_to_expand *);
 149
 150 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 151 void
 152 unroll_and_peel_loops (int flags)
 153 {
 154   struct loop *loop;
 155   bool check;
 156   loop_iterator li;
 157
 158   /* First perform complete loop peeling (it is almost surely a win,
 159      and affects parameters for further decision a lot).  */
 160   peel_loops_completely (flags);
 161
 162   /* Now decide rest of unrolling and peeling.  */
 163   decide_unrolling_and_peeling (flags);
 164
 165   /* Scan the loops, inner ones first.  */
 166   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 167     {
 168       check = true;
 169       /* And perform the appropriate transformations.  */
 170       switch (loop->lpt_decision.decision)
 171         {
 172         case LPT_PEEL_COMPLETELY:
 173           /* Already done.  */
 174           gcc_unreachable ();
 175         case LPT_PEEL_SIMPLE:
 176           peel_loop_simple (loop);
 177           break;
 178         case LPT_UNROLL_CONSTANT:
 179           unroll_loop_constant_iterations (loop);
 180           break;
 181         case LPT_UNROLL_RUNTIME:
 182           unroll_loop_runtime_iterations (loop);
 183           break;
 184         case LPT_UNROLL_STUPID:
 185           unroll_loop_stupid (loop);
 186           break;
 187         case LPT_NONE:
 188           check = false;
 189           break;
 190         default:
 191           gcc_unreachable ();
 192         }
 193       if (check)
 194         {
 195 #ifdef ENABLE_CHECKING
 196           verify_loop_structure ();
 197 #endif
 198         }
 199     }
 200
 201   iv_analysis_done ();
 202 }
 203
 204 /* Check whether exit of the LOOP is at the end of loop body.  */
 205
 206 static bool
 207 loop_exit_at_end_p (struct loop *loop)
 208 {
 209   struct niter_desc *desc = get_simple_loop_desc (loop);
 210   rtx insn;
 211
 212   if (desc->in_edge->dest != loop->latch)
 213     return false;
 214
 215   /* Check that the latch is empty.  */
 216   FOR_BB_INSNS (loop->latch, insn)
 217     {
 218       if (INSN_P (insn))
 219         return false;
 220     }
 221
 222   return true;
 223 }
 224
 225 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 226 static void
 227 peel_loops_completely (int flags)
 228 {
 229   struct loop *loop;
 230   loop_iterator li;
 231
 232   /* Scan the loops, the inner ones first.  */
 233   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 234     {
 235       loop->lpt_decision.decision = LPT_NONE;
 236
 237       if (dump_file)
 238         fprintf (dump_file,
 239                  "\n;; *** Considering loop %d for complete peeling ***\n",
 240                  loop->num);
 241
 242       loop->ninsns = num_loop_insns (loop);
 243
 244       decide_peel_once_rolling (loop, flags);
 245       if (loop->lpt_decision.decision == LPT_NONE)
 246         decide_peel_completely (loop, flags);
 247
 248       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 249         {
 250           peel_loop_completely (loop);
 251 #ifdef ENABLE_CHECKING
 252           verify_loop_structure ();
 253 #endif
 254         }
 255     }
 256 }
 257
 258 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 259 static void
 260 decide_unrolling_and_peeling (int flags)
 261 {
 262   struct loop *loop;
 263   loop_iterator li;
 264
 265   /* Scan the loops, inner ones first.  */
 266   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 267     {
 268       loop->lpt_decision.decision = LPT_NONE;
 269
 270       if (dump_file)
 271         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 272
 273       /* Do not peel cold areas.  */
 274       if (optimize_loop_for_size_p (loop))
 275         {
 276           if (dump_file)
 277             fprintf (dump_file, ";; Not considering loop, cold area\n");
 278           continue;
 279         }
 280
 281       /* Can the loop be manipulated?  */
 282       if (!can_duplicate_loop_p (loop))
 283         {
 284           if (dump_file)
 285             fprintf (dump_file,
 286                      ";; Not considering loop, cannot duplicate\n");
 287           continue;
 288         }
 289
 290       /* Skip non-innermost loops.  */
 291       if (loop->inner)
 292         {
 293           if (dump_file)
 294             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 295           continue;
 296         }
 297
 298       loop->ninsns = num_loop_insns (loop);
 299       loop->av_ninsns = average_num_loop_insns (loop);
 300
 301       /* Try transformations one by one in decreasing order of
 302          priority.  */
 303
 304       decide_unroll_constant_iterations (loop, flags);
 305       if (loop->lpt_decision.decision == LPT_NONE)
 306         decide_unroll_runtime_iterations (loop, flags);
 307       if (loop->lpt_decision.decision == LPT_NONE)
 308         decide_unroll_stupid (loop, flags);
 309       if (loop->lpt_decision.decision == LPT_NONE)
 310         decide_peel_simple (loop, flags);
 311     }
 312 }
 313
 314 /* Decide whether the LOOP is once rolling and suitable for complete
 315    peeling.  */
 316 static void
 317 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 318 {
 319   struct niter_desc *desc;
 320
 321   if (dump_file)
 322     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 323
 324   /* Is the loop small enough?  */
 325   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 326     {
 327       if (dump_file)
 328         fprintf (dump_file, ";; Not considering loop, is too big\n");
 329       return;
 330     }
 331
 332   /* Check for simple loops.  */
 333   desc = get_simple_loop_desc (loop);
 334
 335   /* Check number of iterations.  */
 336   if (!desc->simple_p
 337       || desc->assumptions
 338       || desc->infinite
 339       || !desc->const_iter
 340       || (desc->niter != 0
 341           && max_loop_iterations_int (loop) != 0))
 342     {
 343       if (dump_file)
 344         fprintf (dump_file,
 345                  ";; Unable to prove that the loop rolls exactly once\n");
 346       return;
 347     }
 348
 349   /* Success.  */
 350   if (dump_file)
 351     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 352   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 353 }
 354
 355 /* Decide whether the LOOP is suitable for complete peeling.  */
 356 static void
 357 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 358 {
 359   unsigned npeel;
 360   struct niter_desc *desc;
 361
 362   if (dump_file)
 363     fprintf (dump_file, "\n;; Considering peeling completely\n");
 364
 365   /* Skip non-innermost loops.  */
 366   if (loop->inner)
 367     {
 368       if (dump_file)
 369         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 370       return;
 371     }
 372
 373   /* Do not peel cold areas.  */
 374   if (optimize_loop_for_size_p (loop))
 375     {
 376       if (dump_file)
 377         fprintf (dump_file, ";; Not considering loop, cold area\n");
 378       return;
 379     }
 380
 381   /* Can the loop be manipulated?  */
 382   if (!can_duplicate_loop_p (loop))
 383     {
 384       if (dump_file)
 385         fprintf (dump_file,
 386                  ";; Not considering loop, cannot duplicate\n");
 387       return;
 388     }
 389
 390   /* npeel = number of iterations to peel.  */
 391   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 392   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 393     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 394
 395   /* Is the loop small enough?  */
 396   if (!npeel)
 397     {
 398       if (dump_file)
 399         fprintf (dump_file, ";; Not considering loop, is too big\n");
 400       return;
 401     }
 402
 403   /* Check for simple loops.  */
 404   desc = get_simple_loop_desc (loop);
 405
 406   /* Check number of iterations.  */
 407   if (!desc->simple_p
 408       || desc->assumptions
 409       || !desc->const_iter
 410       || desc->infinite)
 411     {
 412       if (dump_file)
 413         fprintf (dump_file,
 414                  ";; Unable to prove that the loop iterates constant times\n");
 415       return;
 416     }
 417
 418   if (desc->niter > npeel - 1)
 419     {
 420       if (dump_file)
 421         {
 422           fprintf (dump_file,
 423                    ";; Not peeling loop completely, rolls too much (");
 424           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 425           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 426         }
 427       return;
 428     }
 429
 430   /* Success.  */
 431   if (dump_file)
 432     fprintf (dump_file, ";; Decided to peel loop completely\n");
 433   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 434 }
 435
 436 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 437    completely.  The transformation done:
 438
 439    for (i = 0; i < 4; i++)
 440      body;
 441
 442    ==>
 443
 444    i = 0;
 445    body; i++;
 446    body; i++;
 447    body; i++;
 448    body; i++;
 449    */
 450 static void
 451 peel_loop_completely (struct loop *loop)
 452 {
 453   sbitmap wont_exit;
 454   unsigned HOST_WIDE_INT npeel;
 455   unsigned i;
 456   VEC (edge, heap) *remove_edges;
 457   edge ein;
 458   struct niter_desc *desc = get_simple_loop_desc (loop);
 459   struct opt_info *opt_info = NULL;
 460
 461   npeel = desc->niter;
 462
 463   if (npeel)
 464     {
 465       bool ok;
 466
 467       wont_exit = sbitmap_alloc (npeel + 1);
 468       sbitmap_ones (wont_exit);
 469       RESET_BIT (wont_exit, 0);
 470       if (desc->noloop_assumptions)
 471         RESET_BIT (wont_exit, 1);
 472
 473       remove_edges = NULL;
 474
 475       if (flag_split_ivs_in_unroller)
 476         opt_info = analyze_insns_in_loop (loop);
 477
 478       opt_info_start_duplication (opt_info);
 479       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 480                                           npeel,
 481                                           wont_exit, desc->out_edge,
 482                                           &remove_edges,
 483                                           DLTHE_FLAG_UPDATE_FREQ
 484                                           | DLTHE_FLAG_COMPLETTE_PEEL
 485                                           | (opt_info
 486                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 487       gcc_assert (ok);
 488
 489       free (wont_exit);
 490
 491       if (opt_info)
 492         {
 493           apply_opt_in_copies (opt_info, npeel, false, true);
 494           free_opt_info (opt_info);
 495         }
 496
 497       /* Remove the exit edges.  */
 498       FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
 499         remove_path (ein);
 500       VEC_free (edge, heap, remove_edges);
 501     }
 502
 503   ein = desc->in_edge;
 504   free_simple_loop_desc (loop);
 505
 506   /* Now remove the unreachable part of the last iteration and cancel
 507      the loop.  */
 508   remove_path (ein);
 509
 510   if (dump_file)
 511     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 512 }
 513
 514 /* Decide whether to unroll LOOP iterating constant number of times
 515    and how much.  */
 516
 517 static void
 518 decide_unroll_constant_iterations (struct loop *loop, int flags)
 519 {
 520   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 521   struct niter_desc *desc;
 522
 523   if (!(flags & UAP_UNROLL))
 524     {
 525       /* We were not asked to, just return back silently.  */
 526       return;
 527     }
 528
 529   if (dump_file)
 530     fprintf (dump_file,
 531              "\n;; Considering unrolling loop with constant "
 532              "number of iterations\n");
 533
 534   /* nunroll = total number of copies of the original loop body in
 535      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 536   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 537   nunroll_by_av
 538     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 539   if (nunroll > nunroll_by_av)
 540     nunroll = nunroll_by_av;
 541   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 542     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 543
 544   /* Skip big loops.  */
 545   if (nunroll <= 1)
 546     {
 547       if (dump_file)
 548         fprintf (dump_file, ";; Not considering loop, is too big\n");
 549       return;
 550     }
 551
 552   /* Check for simple loops.  */
 553   desc = get_simple_loop_desc (loop);
 554
 555   /* Check number of iterations.  */
 556   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 557     {
 558       if (dump_file)
 559         fprintf (dump_file,
 560                  ";; Unable to prove that the loop iterates constant times\n");
 561       return;
 562     }
 563
 564   /* Check whether the loop rolls enough to consider.  */
 565   if (desc->niter < 2 * nunroll)
 566     {
 567       if (dump_file)
 568         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 569       return;
 570     }
 571
 572   /* Success; now compute number of iterations to unroll.  We alter
 573      nunroll so that as few as possible copies of loop body are
 574      necessary, while still not decreasing the number of unrollings
 575      too much (at most by 1).  */
 576   best_copies = 2 * nunroll + 10;
 577
 578   i = 2 * nunroll + 2;
 579   if (i - 1 >= desc->niter)
 580     i = desc->niter - 2;
 581
 582   for (; i >= nunroll - 1; i--)
 583     {
 584       unsigned exit_mod = desc->niter % (i + 1);
 585
 586       if (!loop_exit_at_end_p (loop))
 587         n_copies = exit_mod + i + 1;
 588       else if (exit_mod != (unsigned) i
 589                || desc->noloop_assumptions != NULL_RTX)
 590         n_copies = exit_mod + i + 2;
 591       else
 592         n_copies = i + 1;
 593
 594       if (n_copies < best_copies)
 595         {
 596           best_copies = n_copies;
 597           best_unroll = i;
 598         }
 599     }
 600
 601   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 602   loop->lpt_decision.times = best_unroll;
 603
 604   if (dump_file)
 605     fprintf (dump_file, ";; Decided to unroll the loop %d times (%d copies).\n",
 606              loop->lpt_decision.times, best_copies);
 607 }
 608
 609 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 610    The transformation does this:
 611
 612    for (i = 0; i < 102; i++)
 613      body;
 614
 615    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 616
 617    i = 0;
 618    body; i++;
 619    body; i++;
 620    while (i < 102)
 621      {
 622        body; i++;
 623        body; i++;
 624        body; i++;
 625        body; i++;
 626      }
 627   */
 628 static void
 629 unroll_loop_constant_iterations (struct loop *loop)
 630 {
 631   unsigned HOST_WIDE_INT niter;
 632   unsigned exit_mod;
 633   sbitmap wont_exit;
 634   unsigned i;
 635   VEC (edge, heap) *remove_edges;
 636   edge e;
 637   unsigned max_unroll = loop->lpt_decision.times;
 638   struct niter_desc *desc = get_simple_loop_desc (loop);
 639   bool exit_at_end = loop_exit_at_end_p (loop);
 640   struct opt_info *opt_info = NULL;
 641   bool ok;
 642
 643   niter = desc->niter;
 644
 645   /* Should not get here (such loop should be peeled instead).  */
 646   gcc_assert (niter > max_unroll + 1);
 647
 648   exit_mod = niter % (max_unroll + 1);
 649
 650   wont_exit = sbitmap_alloc (max_unroll + 1);
 651   sbitmap_ones (wont_exit);
 652
 653   remove_edges = NULL;
 654   if (flag_split_ivs_in_unroller
 655       || flag_variable_expansion_in_unroller)
 656     opt_info = analyze_insns_in_loop (loop);
 657
 658   if (!exit_at_end)
 659     {
 660       /* The exit is not at the end of the loop; leave exit test
 661          in the first copy, so that the loops that start with test
 662          of exit condition have continuous body after unrolling.  */
 663
 664       if (dump_file)
 665         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 666
 667       /* Peel exit_mod iterations.  */
 668       RESET_BIT (wont_exit, 0);
 669       if (desc->noloop_assumptions)
 670         RESET_BIT (wont_exit, 1);
 671
 672       if (exit_mod)
 673         {
 674           opt_info_start_duplication (opt_info);
 675           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 676                                               exit_mod,
 677                                               wont_exit, desc->out_edge,
 678                                               &remove_edges,
 679                                               DLTHE_FLAG_UPDATE_FREQ
 680                                               | (opt_info && exit_mod > 1
 681                                                  ? DLTHE_RECORD_COPY_NUMBER
 682                                                    : 0));
 683           gcc_assert (ok);
 684
 685           if (opt_info && exit_mod > 1)
 686             apply_opt_in_copies (opt_info, exit_mod, false, false);
 687
 688           desc->noloop_assumptions = NULL_RTX;
 689           desc->niter -= exit_mod;
 690           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 691           if (loop->any_estimate
 692               && double_int::from_uhwi (exit_mod).ule
 693                    (loop->nb_iterations_estimate))
 694             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 695           else
 696             loop->any_estimate = false;
 697         }
 698
 699       SET_BIT (wont_exit, 1);
 700     }
 701   else
 702     {
 703       /* Leave exit test in last copy, for the same reason as above if
 704          the loop tests the condition at the end of loop body.  */
 705
 706       if (dump_file)
 707         fprintf (dump_file, ";; Condition at end of loop.\n");
 708
 709       /* We know that niter >= max_unroll + 2; so we do not need to care of
 710          case when we would exit before reaching the loop.  So just peel
 711          exit_mod + 1 iterations.  */
 712       if (exit_mod != max_unroll
 713           || desc->noloop_assumptions)
 714         {
 715           RESET_BIT (wont_exit, 0);
 716           if (desc->noloop_assumptions)
 717             RESET_BIT (wont_exit, 1);
 718
 719           opt_info_start_duplication (opt_info);
 720           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 721                                               exit_mod + 1,
 722                                               wont_exit, desc->out_edge,
 723                                               &remove_edges,
 724                                               DLTHE_FLAG_UPDATE_FREQ
 725                                               | (opt_info && exit_mod > 0
 726                                                  ? DLTHE_RECORD_COPY_NUMBER
 727                                                    : 0));
 728           gcc_assert (ok);
 729
 730           if (opt_info && exit_mod > 0)
 731             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 732
 733           desc->niter -= exit_mod + 1;
 734           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 735           if (loop->any_estimate
 736               && double_int::from_uhwi (exit_mod + 1).ule
 737                    (loop->nb_iterations_estimate))
 738             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 739           else
 740             loop->any_estimate = false;
 741           desc->noloop_assumptions = NULL_RTX;
 742
 743           SET_BIT (wont_exit, 0);
 744           SET_BIT (wont_exit, 1);
 745         }
 746
 747       RESET_BIT (wont_exit, max_unroll);
 748     }
 749
 750   /* Now unroll the loop.  */
 751
 752   opt_info_start_duplication (opt_info);
 753   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 754                                       max_unroll,
 755                                       wont_exit, desc->out_edge,
 756                                       &remove_edges,
 757                                       DLTHE_FLAG_UPDATE_FREQ
 758                                       | (opt_info
 759                                          ? DLTHE_RECORD_COPY_NUMBER
 760                                            : 0));
 761   gcc_assert (ok);
 762
 763   if (opt_info)
 764     {
 765       apply_opt_in_copies (opt_info, max_unroll, true, true);
 766       free_opt_info (opt_info);
 767     }
 768
 769   free (wont_exit);
 770
 771   if (exit_at_end)
 772     {
 773       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 774       /* Find a new in and out edge; they are in the last copy we have made.  */
 775
 776       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 777         {
 778           desc->out_edge = EDGE_SUCC (exit_block, 0);
 779           desc->in_edge = EDGE_SUCC (exit_block, 1);
 780         }
 781       else
 782         {
 783           desc->out_edge = EDGE_SUCC (exit_block, 1);
 784           desc->in_edge = EDGE_SUCC (exit_block, 0);
 785         }
 786     }
 787
 788   desc->niter /= max_unroll + 1;
 789   loop->nb_iterations_upper_bound
 790     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 791                                                                    + 1),
 792                                             TRUNC_DIV_EXPR);
 793   if (loop->any_estimate)
 794     loop->nb_iterations_estimate
 795       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 796                                                                   + 1),
 797                                            TRUNC_DIV_EXPR);
 798   desc->niter_expr = GEN_INT (desc->niter);
 799
 800   /* Remove the edges.  */
 801   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
 802     remove_path (e);
 803   VEC_free (edge, heap, remove_edges);
 804
 805   if (dump_file)
 806     fprintf (dump_file,
 807              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 808              max_unroll, num_loop_insns (loop));
 809 }
 810
 811 /* Decide whether to unroll LOOP iterating runtime computable number of times
 812    and how much.  */
 813 static void
 814 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 815 {
 816   unsigned nunroll, nunroll_by_av, i;
 817   struct niter_desc *desc;
 818   double_int iterations;
 819
 820   if (!(flags & UAP_UNROLL))
 821     {
 822       /* We were not asked to, just return back silently.  */
 823       return;
 824     }
 825
 826   if (dump_file)
 827     fprintf (dump_file,
 828              "\n;; Considering unrolling loop with runtime "
 829              "computable number of iterations\n");
 830
 831   /* nunroll = total number of copies of the original loop body in
 832      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 833   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 834   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 835   if (nunroll > nunroll_by_av)
 836     nunroll = nunroll_by_av;
 837   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 838     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 839
 840   if (targetm.loop_unroll_adjust)
 841     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 842
 843   /* Skip big loops.  */
 844   if (nunroll <= 1)
 845     {
 846       if (dump_file)
 847         fprintf (dump_file, ";; Not considering loop, is too big\n");
 848       return;
 849     }
 850
 851   /* Check for simple loops.  */
 852   desc = get_simple_loop_desc (loop);
 853
 854   /* Check simpleness.  */
 855   if (!desc->simple_p || desc->assumptions)
 856     {
 857       if (dump_file)
 858         fprintf (dump_file,
 859                  ";; Unable to prove that the number of iterations "
 860                  "can be counted in runtime\n");
 861       return;
 862     }
 863
 864   if (desc->const_iter)
 865     {
 866       if (dump_file)
 867         fprintf (dump_file, ";; Loop iterates constant times\n");
 868       return;
 869     }
 870
 871   /* Check whether the loop rolls.  */
 872   if ((estimated_loop_iterations (loop, &iterations)
 873        || max_loop_iterations (loop, &iterations))
 874       && iterations.ult (double_int::from_shwi (2 * nunroll)))
 875     {
 876       if (dump_file)
 877         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 878       return;
 879     }
 880
 881   /* Success; now force nunroll to be power of 2, as we are unable to
 882      cope with overflows in computation of number of iterations.  */
 883   for (i = 1; 2 * i <= nunroll; i *= 2)
 884     continue;
 885
 886   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 887   loop->lpt_decision.times = i - 1;
 888
 889   if (dump_file)
 890     fprintf (dump_file, ";; Decided to unroll the loop %d times.\n",
 891              loop->lpt_decision.times);
 892 }
 893
 894 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 895    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 896    and NULL is returned instead.  */
 897
 898 basic_block
 899 split_edge_and_insert (edge e, rtx insns)
 900 {
 901   basic_block bb;
 902
 903   if (!insns)
 904     return NULL;
 905   bb = split_edge (e);
 906   emit_insn_after (insns, BB_END (bb));
 907
 908   /* ??? We used to assume that INSNS can contain control flow insns, and
 909      that we had to try to find sub basic blocks in BB to maintain a valid
 910      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 911      and call break_superblocks when going out of cfglayout mode.  But it
 912      turns out that this never happens; and that if it does ever happen,
 913      the TODO_verify_flow at the end of the RTL loop passes would fail.
 914
 915      There are two reasons why we expected we could have control flow insns
 916      in INSNS.  The first is when a comparison has to be done in parts, and
 917      the second is when the number of iterations is computed for loops with
 918      the number of iterations known at runtime.  In both cases, test cases
 919      to get control flow in INSNS appear to be impossible to construct:
 920
 921       * If do_compare_rtx_and_jump needs several branches to do comparison
 922         in a mode that needs comparison by parts, we cannot analyze the
 923         number of iterations of the loop, and we never get to unrolling it.
 924
 925       * The code in expand_divmod that was suspected to cause creation of
 926         branching code seems to be only accessed for signed division.  The
 927         divisions used by # of iterations analysis are always unsigned.
 928         Problems might arise on architectures that emits branching code
 929         for some operations that may appear in the unroller (especially
 930         for division), but we have no such architectures.
 931
 932      Considering all this, it was decided that we should for now assume
 933      that INSNS can in theory contain control flow insns, but in practice
 934      it never does.  So we don't handle the theoretical case, and should
 935      a real failure ever show up, we have a pretty good clue for how to
 936      fix it.  */
 937
 938   return bb;
 939 }
 940
 941 /* Unroll LOOP for which we are able to count number of iterations in runtime
 942    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
 943    extra care for case n < 0):
 944
 945    for (i = 0; i < n; i++)
 946      body;
 947
 948    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 949
 950    i = 0;
 951    mod = n % 4;
 952
 953    switch (mod)
 954      {
 955        case 3:
 956          body; i++;
 957        case 2:
 958          body; i++;
 959        case 1:
 960          body; i++;
 961        case 0: ;
 962      }
 963
 964    while (i < n)
 965      {
 966        body; i++;
 967        body; i++;
 968        body; i++;
 969        body; i++;
 970      }
 971    */
 972 static void
 973 unroll_loop_runtime_iterations (struct loop *loop)
 974 {
 975   rtx old_niter, niter, init_code, branch_code, tmp;
 976   unsigned i, j, p;
 977   basic_block preheader, *body, swtch, ezc_swtch;
 978   VEC (basic_block, heap) *dom_bbs;
 979   sbitmap wont_exit;
 980   int may_exit_copy;
 981   unsigned n_peel;
 982   VEC (edge, heap) *remove_edges;
 983   edge e;
 984   bool extra_zero_check, last_may_exit;
 985   unsigned max_unroll = loop->lpt_decision.times;
 986   struct niter_desc *desc = get_simple_loop_desc (loop);
 987   bool exit_at_end = loop_exit_at_end_p (loop);
 988   struct opt_info *opt_info = NULL;
 989   bool ok;
 990
 991   if (flag_split_ivs_in_unroller
 992       || flag_variable_expansion_in_unroller)
 993     opt_info = analyze_insns_in_loop (loop);
 994
 995   /* Remember blocks whose dominators will have to be updated.  */
 996   dom_bbs = NULL;
 997
 998   body = get_loop_body (loop);
 999   for (i = 0; i < loop->num_nodes; i++)
1000     {
1001       VEC (basic_block, heap) *ldom;
1002       basic_block bb;
1003
1004       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1005       FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
1006         if (!flow_bb_inside_loop_p (loop, bb))
1007           VEC_safe_push (basic_block, heap, dom_bbs, bb);
1008
1009       VEC_free (basic_block, heap, ldom);
1010     }
1011   free (body);
1012
1013   if (!exit_at_end)
1014     {
1015       /* Leave exit in first copy (for explanation why see comment in
1016          unroll_loop_constant_iterations).  */
1017       may_exit_copy = 0;
1018       n_peel = max_unroll - 1;
1019       extra_zero_check = true;
1020       last_may_exit = false;
1021     }
1022   else
1023     {
1024       /* Leave exit in last copy (for explanation why see comment in
1025          unroll_loop_constant_iterations).  */
1026       may_exit_copy = max_unroll;
1027       n_peel = max_unroll;
1028       extra_zero_check = false;
1029       last_may_exit = true;
1030     }
1031
1032   /* Get expression for number of iterations.  */
1033   start_sequence ();
1034   old_niter = niter = gen_reg_rtx (desc->mode);
1035   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1036   if (tmp != niter)
1037     emit_move_insn (niter, tmp);
1038
1039   /* Count modulo by ANDing it with max_unroll; we use the fact that
1040      the number of unrollings is a power of two, and thus this is correct
1041      even if there is overflow in the computation.  */
1042   niter = expand_simple_binop (desc->mode, AND,
1043                                niter,
1044                                GEN_INT (max_unroll),
1045                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1046
1047   init_code = get_insns ();
1048   end_sequence ();
1049   unshare_all_rtl_in_chain (init_code);
1050
1051   /* Precondition the loop.  */
1052   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1053
1054   remove_edges = NULL;
1055
1056   wont_exit = sbitmap_alloc (max_unroll + 2);
1057
1058   /* Peel the first copy of loop body (almost always we must leave exit test
1059      here; the only exception is when we have extra zero check and the number
1060      of iterations is reliable.  Also record the place of (possible) extra
1061      zero check.  */
1062   sbitmap_zero (wont_exit);
1063   if (extra_zero_check
1064       && !desc->noloop_assumptions)
1065     SET_BIT (wont_exit, 1);
1066   ezc_swtch = loop_preheader_edge (loop)->src;
1067   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1068                                       1, wont_exit, desc->out_edge,
1069                                       &remove_edges,
1070                                       DLTHE_FLAG_UPDATE_FREQ);
1071   gcc_assert (ok);
1072
1073   /* Record the place where switch will be built for preconditioning.  */
1074   swtch = split_edge (loop_preheader_edge (loop));
1075
1076   for (i = 0; i < n_peel; i++)
1077     {
1078       /* Peel the copy.  */
1079       sbitmap_zero (wont_exit);
1080       if (i != n_peel - 1 || !last_may_exit)
1081         SET_BIT (wont_exit, 1);
1082       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1083                                           1, wont_exit, desc->out_edge,
1084                                           &remove_edges,
1085                                           DLTHE_FLAG_UPDATE_FREQ);
1086       gcc_assert (ok);
1087
1088       /* Create item for switch.  */
1089       j = n_peel - i - (extra_zero_check ? 0 : 1);
1090       p = REG_BR_PROB_BASE / (i + 2);
1091
1092       preheader = split_edge (loop_preheader_edge (loop));
1093       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1094                                           block_label (preheader), p,
1095                                           NULL_RTX);
1096
1097       /* We rely on the fact that the compare and jump cannot be optimized out,
1098          and hence the cfg we create is correct.  */
1099       gcc_assert (branch_code != NULL_RTX);
1100
1101       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1102       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1103       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1104       e = make_edge (swtch, preheader,
1105                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1106       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1107       e->probability = p;
1108     }
1109
1110   if (extra_zero_check)
1111     {
1112       /* Add branch for zero iterations.  */
1113       p = REG_BR_PROB_BASE / (max_unroll + 1);
1114       swtch = ezc_swtch;
1115       preheader = split_edge (loop_preheader_edge (loop));
1116       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1117                                           block_label (preheader), p,
1118                                           NULL_RTX);
1119       gcc_assert (branch_code != NULL_RTX);
1120
1121       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1122       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1123       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1124       e = make_edge (swtch, preheader,
1125                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1126       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1127       e->probability = p;
1128     }
1129
1130   /* Recount dominators for outer blocks.  */
1131   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1132
1133   /* And unroll loop.  */
1134
1135   sbitmap_ones (wont_exit);
1136   RESET_BIT (wont_exit, may_exit_copy);
1137   opt_info_start_duplication (opt_info);
1138
1139   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1140                                       max_unroll,
1141                                       wont_exit, desc->out_edge,
1142                                       &remove_edges,
1143                                       DLTHE_FLAG_UPDATE_FREQ
1144                                       | (opt_info
1145                                          ? DLTHE_RECORD_COPY_NUMBER
1146                                            : 0));
1147   gcc_assert (ok);
1148
1149   if (opt_info)
1150     {
1151       apply_opt_in_copies (opt_info, max_unroll, true, true);
1152       free_opt_info (opt_info);
1153     }
1154
1155   free (wont_exit);
1156
1157   if (exit_at_end)
1158     {
1159       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1160       /* Find a new in and out edge; they are in the last copy we have
1161          made.  */
1162
1163       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1164         {
1165           desc->out_edge = EDGE_SUCC (exit_block, 0);
1166           desc->in_edge = EDGE_SUCC (exit_block, 1);
1167         }
1168       else
1169         {
1170           desc->out_edge = EDGE_SUCC (exit_block, 1);
1171           desc->in_edge = EDGE_SUCC (exit_block, 0);
1172         }
1173     }
1174
1175   /* Remove the edges.  */
1176   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
1177     remove_path (e);
1178   VEC_free (edge, heap, remove_edges);
1179
1180   /* We must be careful when updating the number of iterations due to
1181      preconditioning and the fact that the value must be valid at entry
1182      of the loop.  After passing through the above code, we see that
1183      the correct new number of iterations is this:  */
1184   gcc_assert (!desc->const_iter);
1185   desc->niter_expr =
1186     simplify_gen_binary (UDIV, desc->mode, old_niter,
1187                          GEN_INT (max_unroll + 1));
1188   loop->nb_iterations_upper_bound
1189     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1190                                                                    + 1),
1191                                             TRUNC_DIV_EXPR);
1192   if (loop->any_estimate)
1193     loop->nb_iterations_estimate
1194       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1195                                                                   + 1),
1196                                            TRUNC_DIV_EXPR);
1197   if (exit_at_end)
1198     {
1199       desc->niter_expr =
1200         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1201       desc->noloop_assumptions = NULL_RTX;
1202       --loop->nb_iterations_upper_bound;
1203       if (loop->any_estimate
1204           && loop->nb_iterations_estimate != double_int_zero)
1205         --loop->nb_iterations_estimate;
1206       else
1207         loop->any_estimate = false;
1208     }
1209
1210   if (dump_file)
1211     fprintf (dump_file,
1212              ";; Unrolled loop %d times, counting # of iterations "
1213              "in runtime, %i insns\n",
1214              max_unroll, num_loop_insns (loop));
1215
1216   VEC_free (basic_block, heap, dom_bbs);
1217 }
1218
1219 /* Decide whether to simply peel LOOP and how much.  */
1220 static void
1221 decide_peel_simple (struct loop *loop, int flags)
1222 {
1223   unsigned npeel;
1224   struct niter_desc *desc;
1225   double_int iterations;
1226
1227   if (!(flags & UAP_PEEL))
1228     {
1229       /* We were not asked to, just return back silently.  */
1230       return;
1231     }
1232
1233   if (dump_file)
1234     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1235
1236   /* npeel = number of iterations to peel.  */
1237   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1238   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1239     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1240
1241   /* Skip big loops.  */
1242   if (!npeel)
1243     {
1244       if (dump_file)
1245         fprintf (dump_file, ";; Not considering loop, is too big\n");
1246       return;
1247     }
1248
1249   /* Check for simple loops.  */
1250   desc = get_simple_loop_desc (loop);
1251
1252   /* Check number of iterations.  */
1253   if (desc->simple_p && !desc->assumptions && desc->const_iter)
1254     {
1255       if (dump_file)
1256         fprintf (dump_file, ";; Loop iterates constant times\n");
1257       return;
1258     }
1259
1260   /* Do not simply peel loops with branches inside -- it increases number
1261      of mispredicts.  */
1262   if (num_loop_branches (loop) > 1)
1263     {
1264       if (dump_file)
1265         fprintf (dump_file, ";; Not peeling, contains branches\n");
1266       return;
1267     }
1268
1269   /* If we have realistic estimate on number of iterations, use it.  */
1270   if (estimated_loop_iterations (loop, &iterations))
1271     {
1272       if (double_int::from_shwi (npeel).ule (iterations))
1273         {
1274           if (dump_file)
1275             {
1276               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1277               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1278                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1279               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1280                        npeel);
1281             }
1282           return;
1283         }
1284       npeel = iterations.to_shwi () + 1;
1285     }
1286   /* If we have small enough bound on iterations, we can still peel (completely
1287      unroll).  */
1288   else if (max_loop_iterations (loop, &iterations)
1289            && iterations.ult (double_int::from_shwi (npeel)))
1290     npeel = iterations.to_shwi () + 1;
1291   else
1292     {
1293       /* For now we have no good heuristics to decide whether loop peeling
1294          will be effective, so disable it.  */
1295       if (dump_file)
1296         fprintf (dump_file,
1297                  ";; Not peeling loop, no evidence it will be profitable\n");
1298       return;
1299     }
1300
1301   /* Success.  */
1302   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1303   loop->lpt_decision.times = npeel;
1304
1305   if (dump_file)
1306     fprintf (dump_file, ";; Decided to simply peel the loop %d times.\n",
1307              loop->lpt_decision.times);
1308 }
1309
1310 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1311
1312    while (cond)
1313      body;
1314
1315    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1316
1317    if (!cond) goto end;
1318    body;
1319    if (!cond) goto end;
1320    body;
1321    if (!cond) goto end;
1322    body;
1323    while (cond)
1324      body;
1325    end: ;
1326    */
1327 static void
1328 peel_loop_simple (struct loop *loop)
1329 {
1330   sbitmap wont_exit;
1331   unsigned npeel = loop->lpt_decision.times;
1332   struct niter_desc *desc = get_simple_loop_desc (loop);
1333   struct opt_info *opt_info = NULL;
1334   bool ok;
1335
1336   if (flag_split_ivs_in_unroller && npeel > 1)
1337     opt_info = analyze_insns_in_loop (loop);
1338
1339   wont_exit = sbitmap_alloc (npeel + 1);
1340   sbitmap_zero (wont_exit);
1341
1342   opt_info_start_duplication (opt_info);
1343
1344   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1345                                       npeel, wont_exit, NULL,
1346                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1347                                       | (opt_info
1348                                          ? DLTHE_RECORD_COPY_NUMBER
1349                                            : 0));
1350   gcc_assert (ok);
1351
1352   free (wont_exit);
1353
1354   if (opt_info)
1355     {
1356       apply_opt_in_copies (opt_info, npeel, false, false);
1357       free_opt_info (opt_info);
1358     }
1359
1360   if (desc->simple_p)
1361     {
1362       if (desc->const_iter)
1363         {
1364           desc->niter -= npeel;
1365           desc->niter_expr = GEN_INT (desc->niter);
1366           desc->noloop_assumptions = NULL_RTX;
1367         }
1368       else
1369         {
1370           /* We cannot just update niter_expr, as its value might be clobbered
1371              inside loop.  We could handle this by counting the number into
1372              temporary just like we do in runtime unrolling, but it does not
1373              seem worthwhile.  */
1374           free_simple_loop_desc (loop);
1375         }
1376     }
1377   if (dump_file)
1378     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1379 }
1380
1381 /* Decide whether to unroll LOOP stupidly and how much.  */
1382 static void
1383 decide_unroll_stupid (struct loop *loop, int flags)
1384 {
1385   unsigned nunroll, nunroll_by_av, i;
1386   struct niter_desc *desc;
1387   double_int iterations;
1388
1389   if (!(flags & UAP_UNROLL_ALL))
1390     {
1391       /* We were not asked to, just return back silently.  */
1392       return;
1393     }
1394
1395   if (dump_file)
1396     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1397
1398   /* nunroll = total number of copies of the original loop body in
1399      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1400   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1401   nunroll_by_av
1402     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1403   if (nunroll > nunroll_by_av)
1404     nunroll = nunroll_by_av;
1405   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1406     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1407
1408   if (targetm.loop_unroll_adjust)
1409     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1410
1411   /* Skip big loops.  */
1412   if (nunroll <= 1)
1413     {
1414       if (dump_file)
1415         fprintf (dump_file, ";; Not considering loop, is too big\n");
1416       return;
1417     }
1418
1419   /* Check for simple loops.  */
1420   desc = get_simple_loop_desc (loop);
1421
1422   /* Check simpleness.  */
1423   if (desc->simple_p && !desc->assumptions)
1424     {
1425       if (dump_file)
1426         fprintf (dump_file, ";; The loop is simple\n");
1427       return;
1428     }
1429
1430   /* Do not unroll loops with branches inside -- it increases number
1431      of mispredicts.  */
1432   if (num_loop_branches (loop) > 1)
1433     {
1434       if (dump_file)
1435         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1436       return;
1437     }
1438
1439   /* Check whether the loop rolls.  */
1440   if ((estimated_loop_iterations (loop, &iterations)
1441        || max_loop_iterations (loop, &iterations))
1442       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1443     {
1444       if (dump_file)
1445         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1446       return;
1447     }
1448
1449   /* Success.  Now force nunroll to be power of 2, as it seems that this
1450      improves results (partially because of better alignments, partially
1451      because of some dark magic).  */
1452   for (i = 1; 2 * i <= nunroll; i *= 2)
1453     continue;
1454
1455   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1456   loop->lpt_decision.times = i - 1;
1457
1458   if (dump_file)
1459     fprintf (dump_file, ";; Decided to unroll the loop stupidly %d times.\n",
1460              loop->lpt_decision.times);
1461 }
1462
1463 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1464
1465    while (cond)
1466      body;
1467
1468    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1469
1470    while (cond)
1471      {
1472        body;
1473        if (!cond) break;
1474        body;
1475        if (!cond) break;
1476        body;
1477        if (!cond) break;
1478        body;
1479      }
1480    */
1481 static void
1482 unroll_loop_stupid (struct loop *loop)
1483 {
1484   sbitmap wont_exit;
1485   unsigned nunroll = loop->lpt_decision.times;
1486   struct niter_desc *desc = get_simple_loop_desc (loop);
1487   struct opt_info *opt_info = NULL;
1488   bool ok;
1489
1490   if (flag_split_ivs_in_unroller
1491       || flag_variable_expansion_in_unroller)
1492     opt_info = analyze_insns_in_loop (loop);
1493
1494
1495   wont_exit = sbitmap_alloc (nunroll + 1);
1496   sbitmap_zero (wont_exit);
1497   opt_info_start_duplication (opt_info);
1498
1499   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1500                                       nunroll, wont_exit,
1501                                       NULL, NULL,
1502                                       DLTHE_FLAG_UPDATE_FREQ
1503                                       | (opt_info
1504                                          ? DLTHE_RECORD_COPY_NUMBER
1505                                            : 0));
1506   gcc_assert (ok);
1507
1508   if (opt_info)
1509     {
1510       apply_opt_in_copies (opt_info, nunroll, true, true);
1511       free_opt_info (opt_info);
1512     }
1513
1514   free (wont_exit);
1515
1516   if (desc->simple_p)
1517     {
1518       /* We indeed may get here provided that there are nontrivial assumptions
1519          for a loop to be really simple.  We could update the counts, but the
1520          problem is that we are unable to decide which exit will be taken
1521          (not really true in case the number of iterations is constant,
1522          but noone will do anything with this information, so we do not
1523          worry about it).  */
1524       desc->simple_p = false;
1525     }
1526
1527   if (dump_file)
1528     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1529              nunroll, num_loop_insns (loop));
1530 }
1531
1532 /* A hash function for information about insns to split.  */
1533
1534 static hashval_t
1535 si_info_hash (const void *ivts)
1536 {
1537   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1538 }
1539
1540 /* An equality functions for information about insns to split.  */
1541
1542 static int
1543 si_info_eq (const void *ivts1, const void *ivts2)
1544 {
1545   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1546   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1547
1548   return i1->insn == i2->insn;
1549 }
1550
1551 /* Return a hash for VES, which is really a "var_to_expand *".  */
1552
1553 static hashval_t
1554 ve_info_hash (const void *ves)
1555 {
1556   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1557 }
1558
1559 /* Return true if IVTS1 and IVTS2 (which are really both of type
1560    "var_to_expand *") refer to the same instruction.  */
1561
1562 static int
1563 ve_info_eq (const void *ivts1, const void *ivts2)
1564 {
1565   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1566   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1567
1568   return i1->insn == i2->insn;
1569 }
1570
1571 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1572    Set *DEBUG_USES to the number of debug insns that reference the
1573    variable.  */
1574
1575 bool
1576 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1577                                   int *debug_uses)
1578 {
1579   basic_block *body, bb;
1580   unsigned i;
1581   int count_ref = 0;
1582   rtx insn;
1583
1584   body = get_loop_body (loop);
1585   for (i = 0; i < loop->num_nodes; i++)
1586     {
1587       bb = body[i];
1588
1589       FOR_BB_INSNS (bb, insn)
1590         if (!rtx_referenced_p (reg, insn))
1591           continue;
1592         else if (DEBUG_INSN_P (insn))
1593           ++*debug_uses;
1594         else if (++count_ref > 1)
1595           break;
1596     }
1597   free (body);
1598   return (count_ref  == 1);
1599 }
1600
1601 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1602
1603 static void
1604 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1605 {
1606   basic_block *body, bb;
1607   unsigned i;
1608   rtx insn;
1609
1610   body = get_loop_body (loop);
1611   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1612     {
1613       bb = body[i];
1614
1615       FOR_BB_INSNS (bb, insn)
1616         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1617           continue;
1618         else
1619           {
1620             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1621                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1622             if (!--debug_uses)
1623               break;
1624           }
1625     }
1626   free (body);
1627 }
1628
1629 /* Determine whether INSN contains an accumulator
1630    which can be expanded into separate copies,
1631    one for each copy of the LOOP body.
1632
1633    for (i = 0 ; i < n; i++)
1634      sum += a[i];
1635
1636    ==>
1637
1638    sum += a[i]
1639    ....
1640    i = i+1;
1641    sum1 += a[i]
1642    ....
1643    i = i+1
1644    sum2 += a[i];
1645    ....
1646
1647    Return NULL if INSN contains no opportunity for expansion of accumulator.
1648    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1649    information and return a pointer to it.
1650 */
1651
1652 static struct var_to_expand *
1653 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1654 {
1655   rtx set, dest, src;
1656   struct var_to_expand *ves;
1657   unsigned accum_pos;
1658   enum rtx_code code;
1659   int debug_uses = 0;
1660
1661   set = single_set (insn);
1662   if (!set)
1663     return NULL;
1664
1665   dest = SET_DEST (set);
1666   src = SET_SRC (set);
1667   code = GET_CODE (src);
1668
1669   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1670     return NULL;
1671
1672   if (FLOAT_MODE_P (GET_MODE (dest)))
1673     {
1674       if (!flag_associative_math)
1675         return NULL;
1676       /* In the case of FMA, we're also changing the rounding.  */
1677       if (code == FMA && !flag_unsafe_math_optimizations)
1678         return NULL;
1679     }
1680
1681   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1682      in MD.  But if there is no optab to generate the insn, we can not
1683      perform the variable expansion.  This can happen if an MD provides
1684      an insn but not a named pattern to generate it, for example to avoid
1685      producing code that needs additional mode switches like for x87/mmx.
1686
1687      So we check have_insn_for which looks for an optab for the operation
1688      in SRC.  If it doesn't exist, we can't perform the expansion even
1689      though INSN is valid.  */
1690   if (!have_insn_for (code, GET_MODE (src)))
1691     return NULL;
1692
1693   if (!REG_P (dest)
1694       && !(GET_CODE (dest) == SUBREG
1695            && REG_P (SUBREG_REG (dest))))
1696     return NULL;
1697
1698   /* Find the accumulator use within the operation.  */
1699   if (code == FMA)
1700     {
1701       /* We only support accumulation via FMA in the ADD position.  */
1702       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1703         return NULL;
1704       accum_pos = 2;
1705     }
1706   else if (rtx_equal_p (dest, XEXP (src, 0)))
1707     accum_pos = 0;
1708   else if (rtx_equal_p (dest, XEXP (src, 1)))
1709     {
1710       /* The method of expansion that we are using; which includes the
1711          initialization of the expansions with zero and the summation of
1712          the expansions at the end of the computation will yield wrong
1713          results for (x = something - x) thus avoid using it in that case.  */
1714       if (code == MINUS)
1715         return NULL;
1716       accum_pos = 1;
1717     }
1718   else
1719     return NULL;
1720
1721   /* It must not otherwise be used.  */
1722   if (code == FMA)
1723     {
1724       if (rtx_referenced_p (dest, XEXP (src, 0))
1725           || rtx_referenced_p (dest, XEXP (src, 1)))
1726         return NULL;
1727     }
1728   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1729     return NULL;
1730
1731   /* It must be used in exactly one insn.  */
1732   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1733     return NULL;
1734
1735   if (dump_file)
1736     {
1737       fprintf (dump_file, "\n;; Expanding Accumulator ");
1738       print_rtl (dump_file, dest);
1739       fprintf (dump_file, "\n");
1740     }
1741
1742   if (debug_uses)
1743     /* Instead of resetting the debug insns, we could replace each
1744        debug use in the loop with the sum or product of all expanded
1745        accummulators.  Since we'll only know of all expansions at the
1746        end, we'd have to keep track of which vars_to_expand a debug
1747        insn in the loop references, take note of each copy of the
1748        debug insn during unrolling, and when it's all done, compute
1749        the sum or product of each variable and adjust the original
1750        debug insn and each copy thereof.  What a pain!  */
1751     reset_debug_uses_in_loop (loop, dest, debug_uses);
1752
1753   /* Record the accumulator to expand.  */
1754   ves = XNEW (struct var_to_expand);
1755   ves->insn = insn;
1756   ves->reg = copy_rtx (dest);
1757   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1758   ves->next = NULL;
1759   ves->op = GET_CODE (src);
1760   ves->expansion_count = 0;
1761   ves->reuse_expansion = 0;
1762   return ves;
1763 }
1764
1765 /* Determine whether there is an induction variable in INSN that
1766    we would like to split during unrolling.
1767
1768    I.e. replace
1769
1770    i = i + 1;
1771    ...
1772    i = i + 1;
1773    ...
1774    i = i + 1;
1775    ...
1776
1777    type chains by
1778
1779    i0 = i + 1
1780    ...
1781    i = i0 + 1
1782    ...
1783    i = i0 + 2
1784    ...
1785
1786    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1787    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1788    pointer to it.  */
1789
1790 static struct iv_to_split *
1791 analyze_iv_to_split_insn (rtx insn)
1792 {
1793   rtx set, dest;
1794   struct rtx_iv iv;
1795   struct iv_to_split *ivts;
1796   bool ok;
1797
1798   /* For now we just split the basic induction variables.  Later this may be
1799      extended for example by selecting also addresses of memory references.  */
1800   set = single_set (insn);
1801   if (!set)
1802     return NULL;
1803
1804   dest = SET_DEST (set);
1805   if (!REG_P (dest))
1806     return NULL;
1807
1808   if (!biv_p (insn, dest))
1809     return NULL;
1810
1811   ok = iv_analyze_result (insn, dest, &iv);
1812
1813   /* This used to be an assert under the assumption that if biv_p returns
1814      true that iv_analyze_result must also return true.  However, that
1815      assumption is not strictly correct as evidenced by pr25569.
1816
1817      Returning NULL when iv_analyze_result returns false is safe and
1818      avoids the problems in pr25569 until the iv_analyze_* routines
1819      can be fixed, which is apparently hard and time consuming
1820      according to their author.  */
1821   if (! ok)
1822     return NULL;
1823
1824   if (iv.step == const0_rtx
1825       || iv.mode != iv.extend_mode)
1826     return NULL;
1827
1828   /* Record the insn to split.  */
1829   ivts = XNEW (struct iv_to_split);
1830   ivts->insn = insn;
1831   ivts->base_var = NULL_RTX;
1832   ivts->step = iv.step;
1833   ivts->next = NULL;
1834   ivts->n_loc = 1;
1835   ivts->loc[0] = 1;
1836
1837   return ivts;
1838 }
1839
1840 /* Determines which of insns in LOOP can be optimized.
1841    Return a OPT_INFO struct with the relevant hash tables filled
1842    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1843    is undefined for the return value.  */
1844
1845 static struct opt_info *
1846 analyze_insns_in_loop (struct loop *loop)
1847 {
1848   basic_block *body, bb;
1849   unsigned i;
1850   struct opt_info *opt_info = XCNEW (struct opt_info);
1851   rtx insn;
1852   struct iv_to_split *ivts = NULL;
1853   struct var_to_expand *ves = NULL;
1854   PTR *slot1;
1855   PTR *slot2;
1856   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1857   edge exit;
1858   bool can_apply = false;
1859
1860   iv_analysis_loop_init (loop);
1861
1862   body = get_loop_body (loop);
1863
1864   if (flag_split_ivs_in_unroller)
1865     {
1866       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1867                                               si_info_hash, si_info_eq, free);
1868       opt_info->iv_to_split_head = NULL;
1869       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1870     }
1871
1872   /* Record the loop exit bb and loop preheader before the unrolling.  */
1873   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1874
1875   if (VEC_length (edge, edges) == 1)
1876     {
1877       exit = VEC_index (edge, edges, 0);
1878       if (!(exit->flags & EDGE_COMPLEX))
1879         {
1880           opt_info->loop_exit = split_edge (exit);
1881           can_apply = true;
1882         }
1883     }
1884
1885   if (flag_variable_expansion_in_unroller
1886       && can_apply)
1887     {
1888       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1889                                                         ve_info_hash,
1890                                                         ve_info_eq, free);
1891       opt_info->var_to_expand_head = NULL;
1892       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1893     }
1894
1895   for (i = 0; i < loop->num_nodes; i++)
1896     {
1897       bb = body[i];
1898       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1899         continue;
1900
1901       FOR_BB_INSNS (bb, insn)
1902       {
1903         if (!INSN_P (insn))
1904           continue;
1905
1906         if (opt_info->insns_to_split)
1907           ivts = analyze_iv_to_split_insn (insn);
1908
1909         if (ivts)
1910           {
1911             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1912             gcc_assert (*slot1 == NULL);
1913             *slot1 = ivts;
1914             *opt_info->iv_to_split_tail = ivts;
1915             opt_info->iv_to_split_tail = &ivts->next;
1916             continue;
1917           }
1918
1919         if (opt_info->insns_with_var_to_expand)
1920           ves = analyze_insn_to_expand_var (loop, insn);
1921
1922         if (ves)
1923           {
1924             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1925             gcc_assert (*slot2 == NULL);
1926             *slot2 = ves;
1927             *opt_info->var_to_expand_tail = ves;
1928             opt_info->var_to_expand_tail = &ves->next;
1929           }
1930       }
1931     }
1932
1933   VEC_free (edge, heap, edges);
1934   free (body);
1935   return opt_info;
1936 }
1937
1938 /* Called just before loop duplication.  Records start of duplicated area
1939    to OPT_INFO.  */
1940
1941 static void
1942 opt_info_start_duplication (struct opt_info *opt_info)
1943 {
1944   if (opt_info)
1945     opt_info->first_new_block = last_basic_block;
1946 }
1947
1948 /* Determine the number of iterations between initialization of the base
1949    variable and the current copy (N_COPY).  N_COPIES is the total number
1950    of newly created copies.  UNROLLING is true if we are unrolling
1951    (not peeling) the loop.  */
1952
1953 static unsigned
1954 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1955 {
1956   if (unrolling)
1957     {
1958       /* If we are unrolling, initialization is done in the original loop
1959          body (number 0).  */
1960       return n_copy;
1961     }
1962   else
1963     {
1964       /* If we are peeling, the copy in that the initialization occurs has
1965          number 1.  The original loop (number 0) is the last.  */
1966       if (n_copy)
1967         return n_copy - 1;
1968       else
1969         return n_copies;
1970     }
1971 }
1972
1973 /* Locate in EXPR the expression corresponding to the location recorded
1974    in IVTS, and return a pointer to the RTX for this location.  */
1975
1976 static rtx *
1977 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1978 {
1979   unsigned i;
1980   rtx *ret = &expr;
1981
1982   for (i = 0; i < ivts->n_loc; i++)
1983     ret = &XEXP (*ret, ivts->loc[i]);
1984
1985   return ret;
1986 }
1987
1988 /* Allocate basic variable for the induction variable chain.  */
1989
1990 static void
1991 allocate_basic_variable (struct iv_to_split *ivts)
1992 {
1993   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1994
1995   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1996 }
1997
1998 /* Insert initialization of basic variable of IVTS before INSN, taking
1999    the initial value from INSN.  */
2000
2001 static void
2002 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2003 {
2004   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2005   rtx seq;
2006
2007   start_sequence ();
2008   expr = force_operand (expr, ivts->base_var);
2009   if (expr != ivts->base_var)
2010     emit_move_insn (ivts->base_var, expr);
2011   seq = get_insns ();
2012   end_sequence ();
2013
2014   emit_insn_before (seq, insn);
2015 }
2016
2017 /* Replace the use of induction variable described in IVTS in INSN
2018    by base variable + DELTA * step.  */
2019
2020 static void
2021 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2022 {
2023   rtx expr, *loc, seq, incr, var;
2024   enum machine_mode mode = GET_MODE (ivts->base_var);
2025   rtx src, dest, set;
2026
2027   /* Construct base + DELTA * step.  */
2028   if (!delta)
2029     expr = ivts->base_var;
2030   else
2031     {
2032       incr = simplify_gen_binary (MULT, mode,
2033                                   ivts->step, gen_int_mode (delta, mode));
2034       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2035                                   ivts->base_var, incr);
2036     }
2037
2038   /* Figure out where to do the replacement.  */
2039   loc = get_ivts_expr (single_set (insn), ivts);
2040
2041   /* If we can make the replacement right away, we're done.  */
2042   if (validate_change (insn, loc, expr, 0))
2043     return;
2044
2045   /* Otherwise, force EXPR into a register and try again.  */
2046   start_sequence ();
2047   var = gen_reg_rtx (mode);
2048   expr = force_operand (expr, var);
2049   if (expr != var)
2050     emit_move_insn (var, expr);
2051   seq = get_insns ();
2052   end_sequence ();
2053   emit_insn_before (seq, insn);
2054
2055   if (validate_change (insn, loc, var, 0))
2056     return;
2057
2058   /* The last chance.  Try recreating the assignment in insn
2059      completely from scratch.  */
2060   set = single_set (insn);
2061   gcc_assert (set);
2062
2063   start_sequence ();
2064   *loc = var;
2065   src = copy_rtx (SET_SRC (set));
2066   dest = copy_rtx (SET_DEST (set));
2067   src = force_operand (src, dest);
2068   if (src != dest)
2069     emit_move_insn (dest, src);
2070   seq = get_insns ();
2071   end_sequence ();
2072
2073   emit_insn_before (seq, insn);
2074   delete_insn (insn);
2075 }
2076
2077
2078 /* Return one expansion of the accumulator recorded in struct VE.  */
2079
2080 static rtx
2081 get_expansion (struct var_to_expand *ve)
2082 {
2083   rtx reg;
2084
2085   if (ve->reuse_expansion == 0)
2086     reg = ve->reg;
2087   else
2088     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2089
2090   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2091     ve->reuse_expansion = 0;
2092   else
2093     ve->reuse_expansion++;
2094
2095   return reg;
2096 }
2097
2098
2099 /* Given INSN replace the uses of the accumulator recorded in VE
2100    with a new register.  */
2101
2102 static void
2103 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2104 {
2105   rtx new_reg, set;
2106   bool really_new_expansion = false;
2107
2108   set = single_set (insn);
2109   gcc_assert (set);
2110
2111   /* Generate a new register only if the expansion limit has not been
2112      reached.  Else reuse an already existing expansion.  */
2113   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2114     {
2115       really_new_expansion = true;
2116       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2117     }
2118   else
2119     new_reg = get_expansion (ve);
2120
2121   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2122   if (apply_change_group ())
2123     if (really_new_expansion)
2124       {
2125         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2126         ve->expansion_count++;
2127       }
2128 }
2129
2130 /* Initialize the variable expansions in loop preheader.  PLACE is the
2131    loop-preheader basic block where the initialization of the
2132    expansions should take place.  The expansions are initialized with
2133    (-0) when the operation is plus or minus to honor sign zero.  This
2134    way we can prevent cases where the sign of the final result is
2135    effected by the sign of the expansion.  Here is an example to
2136    demonstrate this:
2137
2138    for (i = 0 ; i < n; i++)
2139      sum += something;
2140
2141    ==>
2142
2143    sum += something
2144    ....
2145    i = i+1;
2146    sum1 += something
2147    ....
2148    i = i+1
2149    sum2 += something;
2150    ....
2151
2152    When SUM is initialized with -zero and SOMETHING is also -zero; the
2153    final result of sum should be -zero thus the expansions sum1 and sum2
2154    should be initialized with -zero as well (otherwise we will get +zero
2155    as the final result).  */
2156
2157 static void
2158 insert_var_expansion_initialization (struct var_to_expand *ve,
2159                                      basic_block place)
2160 {
2161   rtx seq, var, zero_init;
2162   unsigned i;
2163   enum machine_mode mode = GET_MODE (ve->reg);
2164   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2165
2166   if (VEC_length (rtx, ve->var_expansions) == 0)
2167     return;
2168
2169   start_sequence ();
2170   switch (ve->op)
2171     {
2172     case FMA:
2173       /* Note that we only accumulate FMA via the ADD operand.  */
2174     case PLUS:
2175     case MINUS:
2176       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2177         {
2178           if (honor_signed_zero_p)
2179             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2180           else
2181             zero_init = CONST0_RTX (mode);
2182           emit_move_insn (var, zero_init);
2183         }
2184       break;
2185
2186     case MULT:
2187       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2188         {
2189           zero_init = CONST1_RTX (GET_MODE (var));
2190           emit_move_insn (var, zero_init);
2191         }
2192       break;
2193
2194     default:
2195       gcc_unreachable ();
2196     }
2197
2198   seq = get_insns ();
2199   end_sequence ();
2200
2201   emit_insn_after (seq, BB_END (place));
2202 }
2203
2204 /* Combine the variable expansions at the loop exit.  PLACE is the
2205    loop exit basic block where the summation of the expansions should
2206    take place.  */
2207
2208 static void
2209 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2210 {
2211   rtx sum = ve->reg;
2212   rtx expr, seq, var, insn;
2213   unsigned i;
2214
2215   if (VEC_length (rtx, ve->var_expansions) == 0)
2216     return;
2217
2218   start_sequence ();
2219   switch (ve->op)
2220     {
2221     case FMA:
2222       /* Note that we only accumulate FMA via the ADD operand.  */
2223     case PLUS:
2224     case MINUS:
2225       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2226         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2227       break;
2228
2229     case MULT:
2230       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2231         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2232       break;
2233
2234     default:
2235       gcc_unreachable ();
2236     }
2237
2238   expr = force_operand (sum, ve->reg);
2239   if (expr != ve->reg)
2240     emit_move_insn (ve->reg, expr);
2241   seq = get_insns ();
2242   end_sequence ();
2243
2244   insn = BB_HEAD (place);
2245   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2246     insn = NEXT_INSN (insn);
2247
2248   emit_insn_after (seq, insn);
2249 }
2250
2251 /* Apply loop optimizations in loop copies using the
2252    data which gathered during the unrolling.  Structure
2253    OPT_INFO record that data.
2254
2255    UNROLLING is true if we unrolled (not peeled) the loop.
2256    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2257    the loop (as it should happen in complete unrolling, but not in ordinary
2258    peeling of the loop).  */
2259
2260 static void
2261 apply_opt_in_copies (struct opt_info *opt_info,
2262                      unsigned n_copies, bool unrolling,
2263                      bool rewrite_original_loop)
2264 {
2265   unsigned i, delta;
2266   basic_block bb, orig_bb;
2267   rtx insn, orig_insn, next;
2268   struct iv_to_split ivts_templ, *ivts;
2269   struct var_to_expand ve_templ, *ves;
2270
2271   /* Sanity check -- we need to put initialization in the original loop
2272      body.  */
2273   gcc_assert (!unrolling || rewrite_original_loop);
2274
2275   /* Allocate the basic variables (i0).  */
2276   if (opt_info->insns_to_split)
2277     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2278       allocate_basic_variable (ivts);
2279
2280   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2281     {
2282       bb = BASIC_BLOCK (i);
2283       orig_bb = get_bb_original (bb);
2284
2285       /* bb->aux holds position in copy sequence initialized by
2286          duplicate_loop_to_header_edge.  */
2287       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2288                                         unrolling);
2289       bb->aux = 0;
2290       orig_insn = BB_HEAD (orig_bb);
2291       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2292         {
2293           next = NEXT_INSN (insn);
2294           if (!INSN_P (insn)
2295               || (DEBUG_INSN_P (insn)
2296                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2297             continue;
2298
2299           while (!INSN_P (orig_insn)
2300                  || (DEBUG_INSN_P (orig_insn)
2301                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2302                          == LABEL_DECL)))
2303             orig_insn = NEXT_INSN (orig_insn);
2304
2305           ivts_templ.insn = orig_insn;
2306           ve_templ.insn = orig_insn;
2307
2308           /* Apply splitting iv optimization.  */
2309           if (opt_info->insns_to_split)
2310             {
2311               ivts = (struct iv_to_split *)
2312                 htab_find (opt_info->insns_to_split, &ivts_templ);
2313
2314               if (ivts)
2315                 {
2316                   gcc_assert (GET_CODE (PATTERN (insn))
2317                               == GET_CODE (PATTERN (orig_insn)));
2318
2319                   if (!delta)
2320                     insert_base_initialization (ivts, insn);
2321                   split_iv (ivts, insn, delta);
2322                 }
2323             }
2324           /* Apply variable expansion optimization.  */
2325           if (unrolling && opt_info->insns_with_var_to_expand)
2326             {
2327               ves = (struct var_to_expand *)
2328                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2329               if (ves)
2330                 {
2331                   gcc_assert (GET_CODE (PATTERN (insn))
2332                               == GET_CODE (PATTERN (orig_insn)));
2333                   expand_var_during_unrolling (ves, insn);
2334                 }
2335             }
2336           orig_insn = NEXT_INSN (orig_insn);
2337         }
2338     }
2339
2340   if (!rewrite_original_loop)
2341     return;
2342
2343   /* Initialize the variable expansions in the loop preheader
2344      and take care of combining them at the loop exit.  */
2345   if (opt_info->insns_with_var_to_expand)
2346     {
2347       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2348         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2349       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2350         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2351     }
2352
2353   /* Rewrite also the original loop body.  Find them as originals of the blocks
2354      in the last copied iteration, i.e. those that have
2355      get_bb_copy (get_bb_original (bb)) == bb.  */
2356   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2357     {
2358       bb = BASIC_BLOCK (i);
2359       orig_bb = get_bb_original (bb);
2360       if (get_bb_copy (orig_bb) != bb)
2361         continue;
2362
2363       delta = determine_split_iv_delta (0, n_copies, unrolling);
2364       for (orig_insn = BB_HEAD (orig_bb);
2365            orig_insn != NEXT_INSN (BB_END (bb));
2366            orig_insn = next)
2367         {
2368           next = NEXT_INSN (orig_insn);
2369
2370           if (!INSN_P (orig_insn))
2371             continue;
2372
2373           ivts_templ.insn = orig_insn;
2374           if (opt_info->insns_to_split)
2375             {
2376               ivts = (struct iv_to_split *)
2377                 htab_find (opt_info->insns_to_split, &ivts_templ);
2378               if (ivts)
2379                 {
2380                   if (!delta)
2381                     insert_base_initialization (ivts, orig_insn);
2382                   split_iv (ivts, orig_insn, delta);
2383                   continue;
2384                 }
2385             }
2386
2387         }
2388     }
2389 }
2390
2391 /* Release OPT_INFO.  */
2392
2393 static void
2394 free_opt_info (struct opt_info *opt_info)
2395 {
2396   if (opt_info->insns_to_split)
2397     htab_delete (opt_info->insns_to_split);
2398   if (opt_info->insns_with_var_to_expand)
2399     {
2400       struct var_to_expand *ves;
2401
2402       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2403         VEC_free (rtx, heap, ves->var_expansions);
2404       htab_delete (opt_info->insns_with_var_to_expand);
2405     }
2406   free (opt_info);
2407 }