gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hashtab.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
   42    by fewer tests of exit conditions.  The main problem is code growth,
   43    which impacts performance negatively due to cache effects.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
   64    There are a lot of parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
   67    ??? A great problem is that we don't have a good way to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance in order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx base_var;         /* The variable on that the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103   unsigned accum_pos;              /* The position in which the accumulator is placed in
 104                                       the insn src.  For example in x = x + something
 105                                       accum_pos is 0 while in x = something + x accum_pos
 106                                       is 1.  */
 107 };
 108
 109 /* Information about optimization applied in
 110    the unrolled loop.  */
 111
 112 struct opt_info
 113 {
 114   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 115   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 116   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 117   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 118                                       to expand.  */
 119   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 120   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 121   unsigned first_new_block;        /* The first basic block that was
 122                                       duplicated.  */
 123   basic_block loop_exit;           /* The loop exit basic block.  */
 124   basic_block loop_preheader;      /* The loop preheader basic block.  */
 125 };
 126
 127 static void decide_unrolling_and_peeling (int);
 128 static void peel_loops_completely (int);
 129 static void decide_peel_simple (struct loop *, int);
 130 static void decide_peel_once_rolling (struct loop *, int);
 131 static void decide_peel_completely (struct loop *, int);
 132 static void decide_unroll_stupid (struct loop *, int);
 133 static void decide_unroll_constant_iterations (struct loop *, int);
 134 static void decide_unroll_runtime_iterations (struct loop *, int);
 135 static void peel_loop_simple (struct loop *);
 136 static void peel_loop_completely (struct loop *);
 137 static void unroll_loop_stupid (struct loop *);
 138 static void unroll_loop_constant_iterations (struct loop *);
 139 static void unroll_loop_runtime_iterations (struct loop *);
 140 static struct opt_info *analyze_insns_in_loop (struct loop *);
 141 static void opt_info_start_duplication (struct opt_info *);
 142 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 143 static void free_opt_info (struct opt_info *);
 144 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 145 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 146 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 147 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 148 static void insert_var_expansion_initialization (struct var_to_expand *,
 149                                                  basic_block);
 150 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 151                                              basic_block);
 152 static rtx get_expansion (struct var_to_expand *);
 153
 154 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 155 void
 156 unroll_and_peel_loops (int flags)
 157 {
 158   struct loop *loop;
 159   bool check;
 160   loop_iterator li;
 161
 162   /* First perform complete loop peeling (it is almost surely a win,
 163      and affects parameters for further decision a lot).  */
 164   peel_loops_completely (flags);
 165
 166   /* Now decide rest of unrolling and peeling.  */
 167   decide_unrolling_and_peeling (flags);
 168
 169   /* Scan the loops, inner ones first.  */
 170   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 171     {
 172       check = true;
 173       /* And perform the appropriate transformations.  */
 174       switch (loop->lpt_decision.decision)
 175         {
 176         case LPT_PEEL_COMPLETELY:
 177           /* Already done.  */
 178           gcc_unreachable ();
 179         case LPT_PEEL_SIMPLE:
 180           peel_loop_simple (loop);
 181           break;
 182         case LPT_UNROLL_CONSTANT:
 183           unroll_loop_constant_iterations (loop);
 184           break;
 185         case LPT_UNROLL_RUNTIME:
 186           unroll_loop_runtime_iterations (loop);
 187           break;
 188         case LPT_UNROLL_STUPID:
 189           unroll_loop_stupid (loop);
 190           break;
 191         case LPT_NONE:
 192           check = false;
 193           break;
 194         default:
 195           gcc_unreachable ();
 196         }
 197       if (check)
 198         {
 199 #ifdef ENABLE_CHECKING
 200           verify_loop_structure ();
 201 #endif
 202         }
 203     }
 204
 205   iv_analysis_done ();
 206 }
 207
 208 /* Check whether exit of the LOOP is at the end of loop body.  */
 209
 210 static bool
 211 loop_exit_at_end_p (struct loop *loop)
 212 {
 213   struct niter_desc *desc = get_simple_loop_desc (loop);
 214   rtx insn;
 215
 216   if (desc->in_edge->dest != loop->latch)
 217     return false;
 218
 219   /* Check that the latch is empty.  */
 220   FOR_BB_INSNS (loop->latch, insn)
 221     {
 222       if (INSN_P (insn))
 223         return false;
 224     }
 225
 226   return true;
 227 }
 228
 229 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 230 static void
 231 peel_loops_completely (int flags)
 232 {
 233   struct loop *loop;
 234   loop_iterator li;
 235
 236   /* Scan the loops, the inner ones first.  */
 237   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 238     {
 239       loop->lpt_decision.decision = LPT_NONE;
 240
 241       if (dump_file)
 242         fprintf (dump_file,
 243                  "\n;; *** Considering loop %d for complete peeling ***\n",
 244                  loop->num);
 245
 246       loop->ninsns = num_loop_insns (loop);
 247
 248       decide_peel_once_rolling (loop, flags);
 249       if (loop->lpt_decision.decision == LPT_NONE)
 250         decide_peel_completely (loop, flags);
 251
 252       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 253         {
 254           peel_loop_completely (loop);
 255 #ifdef ENABLE_CHECKING
 256           verify_loop_structure ();
 257 #endif
 258         }
 259     }
 260 }
 261
 262 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 263 static void
 264 decide_unrolling_and_peeling (int flags)
 265 {
 266   struct loop *loop;
 267   loop_iterator li;
 268
 269   /* Scan the loops, inner ones first.  */
 270   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 271     {
 272       loop->lpt_decision.decision = LPT_NONE;
 273
 274       if (dump_file)
 275         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 276
 277       /* Do not peel cold areas.  */
 278       if (optimize_loop_for_size_p (loop))
 279         {
 280           if (dump_file)
 281             fprintf (dump_file, ";; Not considering loop, cold area\n");
 282           continue;
 283         }
 284
 285       /* Can the loop be manipulated?  */
 286       if (!can_duplicate_loop_p (loop))
 287         {
 288           if (dump_file)
 289             fprintf (dump_file,
 290                      ";; Not considering loop, cannot duplicate\n");
 291           continue;
 292         }
 293
 294       /* Skip non-innermost loops.  */
 295       if (loop->inner)
 296         {
 297           if (dump_file)
 298             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 299           continue;
 300         }
 301
 302       loop->ninsns = num_loop_insns (loop);
 303       loop->av_ninsns = average_num_loop_insns (loop);
 304
 305       /* Try transformations one by one in decreasing order of
 306          priority.  */
 307
 308       decide_unroll_constant_iterations (loop, flags);
 309       if (loop->lpt_decision.decision == LPT_NONE)
 310         decide_unroll_runtime_iterations (loop, flags);
 311       if (loop->lpt_decision.decision == LPT_NONE)
 312         decide_unroll_stupid (loop, flags);
 313       if (loop->lpt_decision.decision == LPT_NONE)
 314         decide_peel_simple (loop, flags);
 315     }
 316 }
 317
 318 /* Decide whether the LOOP is once rolling and suitable for complete
 319    peeling.  */
 320 static void
 321 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 322 {
 323   struct niter_desc *desc;
 324
 325   if (dump_file)
 326     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 327
 328   /* Is the loop small enough?  */
 329   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 330     {
 331       if (dump_file)
 332         fprintf (dump_file, ";; Not considering loop, is too big\n");
 333       return;
 334     }
 335
 336   /* Check for simple loops.  */
 337   desc = get_simple_loop_desc (loop);
 338
 339   /* Check number of iterations.  */
 340   if (!desc->simple_p
 341       || desc->assumptions
 342       || desc->infinite
 343       || !desc->const_iter
 344       || (desc->niter != 0
 345           && max_loop_iterations_int (loop) != 0))
 346     {
 347       if (dump_file)
 348         fprintf (dump_file,
 349                  ";; Unable to prove that the loop rolls exactly once\n");
 350       return;
 351     }
 352
 353   /* Success.  */
 354   if (dump_file)
 355     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 356   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 357 }
 358
 359 /* Decide whether the LOOP is suitable for complete peeling.  */
 360 static void
 361 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 362 {
 363   unsigned npeel;
 364   struct niter_desc *desc;
 365
 366   if (dump_file)
 367     fprintf (dump_file, "\n;; Considering peeling completely\n");
 368
 369   /* Skip non-innermost loops.  */
 370   if (loop->inner)
 371     {
 372       if (dump_file)
 373         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 374       return;
 375     }
 376
 377   /* Do not peel cold areas.  */
 378   if (optimize_loop_for_size_p (loop))
 379     {
 380       if (dump_file)
 381         fprintf (dump_file, ";; Not considering loop, cold area\n");
 382       return;
 383     }
 384
 385   /* Can the loop be manipulated?  */
 386   if (!can_duplicate_loop_p (loop))
 387     {
 388       if (dump_file)
 389         fprintf (dump_file,
 390                  ";; Not considering loop, cannot duplicate\n");
 391       return;
 392     }
 393
 394   /* npeel = number of iterations to peel.  */
 395   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 396   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 397     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 398
 399   /* Is the loop small enough?  */
 400   if (!npeel)
 401     {
 402       if (dump_file)
 403         fprintf (dump_file, ";; Not considering loop, is too big\n");
 404       return;
 405     }
 406
 407   /* Check for simple loops.  */
 408   desc = get_simple_loop_desc (loop);
 409
 410   /* Check number of iterations.  */
 411   if (!desc->simple_p
 412       || desc->assumptions
 413       || !desc->const_iter
 414       || desc->infinite)
 415     {
 416       if (dump_file)
 417         fprintf (dump_file,
 418                  ";; Unable to prove that the loop iterates constant times\n");
 419       return;
 420     }
 421
 422   if (desc->niter > npeel - 1)
 423     {
 424       if (dump_file)
 425         {
 426           fprintf (dump_file,
 427                    ";; Not peeling loop completely, rolls too much (");
 428           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 429           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 430         }
 431       return;
 432     }
 433
 434   /* Success.  */
 435   if (dump_file)
 436     fprintf (dump_file, ";; Decided to peel loop completely\n");
 437   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 438 }
 439
 440 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 441    completely.  The transformation done:
 442
 443    for (i = 0; i < 4; i++)
 444      body;
 445
 446    ==>
 447
 448    i = 0;
 449    body; i++;
 450    body; i++;
 451    body; i++;
 452    body; i++;
 453    */
 454 static void
 455 peel_loop_completely (struct loop *loop)
 456 {
 457   sbitmap wont_exit;
 458   unsigned HOST_WIDE_INT npeel;
 459   unsigned i;
 460   VEC (edge, heap) *remove_edges;
 461   edge ein;
 462   struct niter_desc *desc = get_simple_loop_desc (loop);
 463   struct opt_info *opt_info = NULL;
 464
 465   npeel = desc->niter;
 466
 467   if (npeel)
 468     {
 469       bool ok;
 470
 471       wont_exit = sbitmap_alloc (npeel + 1);
 472       sbitmap_ones (wont_exit);
 473       RESET_BIT (wont_exit, 0);
 474       if (desc->noloop_assumptions)
 475         RESET_BIT (wont_exit, 1);
 476
 477       remove_edges = NULL;
 478
 479       if (flag_split_ivs_in_unroller)
 480         opt_info = analyze_insns_in_loop (loop);
 481
 482       opt_info_start_duplication (opt_info);
 483       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 484                                           npeel,
 485                                           wont_exit, desc->out_edge,
 486                                           &remove_edges,
 487                                           DLTHE_FLAG_UPDATE_FREQ
 488                                           | DLTHE_FLAG_COMPLETTE_PEEL
 489                                           | (opt_info
 490                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 491       gcc_assert (ok);
 492
 493       free (wont_exit);
 494
 495       if (opt_info)
 496         {
 497           apply_opt_in_copies (opt_info, npeel, false, true);
 498           free_opt_info (opt_info);
 499         }
 500
 501       /* Remove the exit edges.  */
 502       FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
 503         remove_path (ein);
 504       VEC_free (edge, heap, remove_edges);
 505     }
 506
 507   ein = desc->in_edge;
 508   free_simple_loop_desc (loop);
 509
 510   /* Now remove the unreachable part of the last iteration and cancel
 511      the loop.  */
 512   remove_path (ein);
 513
 514   if (dump_file)
 515     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 516 }
 517
 518 /* Decide whether to unroll LOOP iterating constant number of times
 519    and how much.  */
 520
 521 static void
 522 decide_unroll_constant_iterations (struct loop *loop, int flags)
 523 {
 524   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 525   struct niter_desc *desc;
 526
 527   if (!(flags & UAP_UNROLL))
 528     {
 529       /* We were not asked to, just return back silently.  */
 530       return;
 531     }
 532
 533   if (dump_file)
 534     fprintf (dump_file,
 535              "\n;; Considering unrolling loop with constant "
 536              "number of iterations\n");
 537
 538   /* nunroll = total number of copies of the original loop body in
 539      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 540   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 541   nunroll_by_av
 542     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 543   if (nunroll > nunroll_by_av)
 544     nunroll = nunroll_by_av;
 545   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 546     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 547
 548   /* Skip big loops.  */
 549   if (nunroll <= 1)
 550     {
 551       if (dump_file)
 552         fprintf (dump_file, ";; Not considering loop, is too big\n");
 553       return;
 554     }
 555
 556   /* Check for simple loops.  */
 557   desc = get_simple_loop_desc (loop);
 558
 559   /* Check number of iterations.  */
 560   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 561     {
 562       if (dump_file)
 563         fprintf (dump_file,
 564                  ";; Unable to prove that the loop iterates constant times\n");
 565       return;
 566     }
 567
 568   /* Check whether the loop rolls enough to consider.  */
 569   if (desc->niter < 2 * nunroll)
 570     {
 571       if (dump_file)
 572         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 573       return;
 574     }
 575
 576   /* Success; now compute number of iterations to unroll.  We alter
 577      nunroll so that as few as possible copies of loop body are
 578      necessary, while still not decreasing the number of unrollings
 579      too much (at most by 1).  */
 580   best_copies = 2 * nunroll + 10;
 581
 582   i = 2 * nunroll + 2;
 583   if (i - 1 >= desc->niter)
 584     i = desc->niter - 2;
 585
 586   for (; i >= nunroll - 1; i--)
 587     {
 588       unsigned exit_mod = desc->niter % (i + 1);
 589
 590       if (!loop_exit_at_end_p (loop))
 591         n_copies = exit_mod + i + 1;
 592       else if (exit_mod != (unsigned) i
 593                || desc->noloop_assumptions != NULL_RTX)
 594         n_copies = exit_mod + i + 2;
 595       else
 596         n_copies = i + 1;
 597
 598       if (n_copies < best_copies)
 599         {
 600           best_copies = n_copies;
 601           best_unroll = i;
 602         }
 603     }
 604
 605   if (dump_file)
 606     fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
 607              best_unroll + 1, best_copies, nunroll);
 608
 609   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 610   loop->lpt_decision.times = best_unroll;
 611
 612   if (dump_file)
 613     fprintf (dump_file,
 614              ";; Decided to unroll the constant times rolling loop, %d times.\n",
 615              loop->lpt_decision.times);
 616 }
 617
 618 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
 619    times.  The transformation does this:
 620
 621    for (i = 0; i < 102; i++)
 622      body;
 623
 624    ==>
 625
 626    i = 0;
 627    body; i++;
 628    body; i++;
 629    while (i < 102)
 630      {
 631        body; i++;
 632        body; i++;
 633        body; i++;
 634        body; i++;
 635      }
 636   */
 637 static void
 638 unroll_loop_constant_iterations (struct loop *loop)
 639 {
 640   unsigned HOST_WIDE_INT niter;
 641   unsigned exit_mod;
 642   sbitmap wont_exit;
 643   unsigned i;
 644   VEC (edge, heap) *remove_edges;
 645   edge e;
 646   unsigned max_unroll = loop->lpt_decision.times;
 647   struct niter_desc *desc = get_simple_loop_desc (loop);
 648   bool exit_at_end = loop_exit_at_end_p (loop);
 649   struct opt_info *opt_info = NULL;
 650   bool ok;
 651
 652   niter = desc->niter;
 653
 654   /* Should not get here (such loop should be peeled instead).  */
 655   gcc_assert (niter > max_unroll + 1);
 656
 657   exit_mod = niter % (max_unroll + 1);
 658
 659   wont_exit = sbitmap_alloc (max_unroll + 1);
 660   sbitmap_ones (wont_exit);
 661
 662   remove_edges = NULL;
 663   if (flag_split_ivs_in_unroller
 664       || flag_variable_expansion_in_unroller)
 665     opt_info = analyze_insns_in_loop (loop);
 666
 667   if (!exit_at_end)
 668     {
 669       /* The exit is not at the end of the loop; leave exit test
 670          in the first copy, so that the loops that start with test
 671          of exit condition have continuous body after unrolling.  */
 672
 673       if (dump_file)
 674         fprintf (dump_file, ";; Condition on beginning of loop.\n");
 675
 676       /* Peel exit_mod iterations.  */
 677       RESET_BIT (wont_exit, 0);
 678       if (desc->noloop_assumptions)
 679         RESET_BIT (wont_exit, 1);
 680
 681       if (exit_mod)
 682         {
 683           opt_info_start_duplication (opt_info);
 684           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 685                                               exit_mod,
 686                                               wont_exit, desc->out_edge,
 687                                               &remove_edges,
 688                                               DLTHE_FLAG_UPDATE_FREQ
 689                                               | (opt_info && exit_mod > 1
 690                                                  ? DLTHE_RECORD_COPY_NUMBER
 691                                                    : 0));
 692           gcc_assert (ok);
 693
 694           if (opt_info && exit_mod > 1)
 695             apply_opt_in_copies (opt_info, exit_mod, false, false);
 696
 697           desc->noloop_assumptions = NULL_RTX;
 698           desc->niter -= exit_mod;
 699           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 700           if (loop->any_estimate
 701               && double_int::from_uhwi (exit_mod).ule
 702                    (loop->nb_iterations_estimate))
 703             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 704           else
 705             loop->any_estimate = false;
 706         }
 707
 708       SET_BIT (wont_exit, 1);
 709     }
 710   else
 711     {
 712       /* Leave exit test in last copy, for the same reason as above if
 713          the loop tests the condition at the end of loop body.  */
 714
 715       if (dump_file)
 716         fprintf (dump_file, ";; Condition on end of loop.\n");
 717
 718       /* We know that niter >= max_unroll + 2; so we do not need to care of
 719          case when we would exit before reaching the loop.  So just peel
 720          exit_mod + 1 iterations.  */
 721       if (exit_mod != max_unroll
 722           || desc->noloop_assumptions)
 723         {
 724           RESET_BIT (wont_exit, 0);
 725           if (desc->noloop_assumptions)
 726             RESET_BIT (wont_exit, 1);
 727
 728           opt_info_start_duplication (opt_info);
 729           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 730                                               exit_mod + 1,
 731                                               wont_exit, desc->out_edge,
 732                                               &remove_edges,
 733                                               DLTHE_FLAG_UPDATE_FREQ
 734                                               | (opt_info && exit_mod > 0
 735                                                  ? DLTHE_RECORD_COPY_NUMBER
 736                                                    : 0));
 737           gcc_assert (ok);
 738
 739           if (opt_info && exit_mod > 0)
 740             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 741
 742           desc->niter -= exit_mod + 1;
 743           if (loop->any_estimate
 744               && double_int::from_uhwi (exit_mod + 1).ule
 745                    (loop->nb_iterations_estimate))
 746             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 747           else
 748             loop->any_estimate = false;
 749           desc->noloop_assumptions = NULL_RTX;
 750
 751           SET_BIT (wont_exit, 0);
 752           SET_BIT (wont_exit, 1);
 753         }
 754
 755       RESET_BIT (wont_exit, max_unroll);
 756     }
 757
 758   /* Now unroll the loop.  */
 759
 760   opt_info_start_duplication (opt_info);
 761   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 762                                       max_unroll,
 763                                       wont_exit, desc->out_edge,
 764                                       &remove_edges,
 765                                       DLTHE_FLAG_UPDATE_FREQ
 766                                       | (opt_info
 767                                          ? DLTHE_RECORD_COPY_NUMBER
 768                                            : 0));
 769   gcc_assert (ok);
 770
 771   if (opt_info)
 772     {
 773       apply_opt_in_copies (opt_info, max_unroll, true, true);
 774       free_opt_info (opt_info);
 775     }
 776
 777   free (wont_exit);
 778
 779   if (exit_at_end)
 780     {
 781       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 782       /* Find a new in and out edge; they are in the last copy we have made.  */
 783
 784       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 785         {
 786           desc->out_edge = EDGE_SUCC (exit_block, 0);
 787           desc->in_edge = EDGE_SUCC (exit_block, 1);
 788         }
 789       else
 790         {
 791           desc->out_edge = EDGE_SUCC (exit_block, 1);
 792           desc->in_edge = EDGE_SUCC (exit_block, 0);
 793         }
 794     }
 795
 796   desc->niter /= max_unroll + 1;
 797   loop->nb_iterations_upper_bound
 798     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (exit_mod
 799                                                                    + 1),
 800                                             FLOOR_DIV_EXPR);
 801   if (loop->any_estimate)
 802     loop->nb_iterations_estimate
 803       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (exit_mod
 804                                                                   + 1),
 805                                            FLOOR_DIV_EXPR);
 806   desc->niter_expr = GEN_INT (desc->niter);
 807
 808   /* Remove the edges.  */
 809   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
 810     remove_path (e);
 811   VEC_free (edge, heap, remove_edges);
 812
 813   if (dump_file)
 814     fprintf (dump_file,
 815              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 816              max_unroll, num_loop_insns (loop));
 817 }
 818
 819 /* Decide whether to unroll LOOP iterating runtime computable number of times
 820    and how much.  */
 821 static void
 822 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 823 {
 824   unsigned nunroll, nunroll_by_av, i;
 825   struct niter_desc *desc;
 826   double_int iterations;
 827
 828   if (!(flags & UAP_UNROLL))
 829     {
 830       /* We were not asked to, just return back silently.  */
 831       return;
 832     }
 833
 834   if (dump_file)
 835     fprintf (dump_file,
 836              "\n;; Considering unrolling loop with runtime "
 837              "computable number of iterations\n");
 838
 839   /* nunroll = total number of copies of the original loop body in
 840      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 841   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 842   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 843   if (nunroll > nunroll_by_av)
 844     nunroll = nunroll_by_av;
 845   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 846     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 847
 848   if (targetm.loop_unroll_adjust)
 849     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 850
 851   /* Skip big loops.  */
 852   if (nunroll <= 1)
 853     {
 854       if (dump_file)
 855         fprintf (dump_file, ";; Not considering loop, is too big\n");
 856       return;
 857     }
 858
 859   /* Check for simple loops.  */
 860   desc = get_simple_loop_desc (loop);
 861
 862   /* Check simpleness.  */
 863   if (!desc->simple_p || desc->assumptions)
 864     {
 865       if (dump_file)
 866         fprintf (dump_file,
 867                  ";; Unable to prove that the number of iterations "
 868                  "can be counted in runtime\n");
 869       return;
 870     }
 871
 872   if (desc->const_iter)
 873     {
 874       if (dump_file)
 875         fprintf (dump_file, ";; Loop iterates constant times\n");
 876       return;
 877     }
 878
 879   /* If we have profile feedback, check whether the loop rolls.  */
 880   if ((estimated_loop_iterations (loop, &iterations)
 881        || max_loop_iterations (loop, &iterations))
 882       && iterations.fits_shwi ()
 883       && iterations.to_shwi () <= 2 * nunroll)
 884     {
 885       if (dump_file)
 886         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 887       return;
 888     }
 889
 890   /* Success; now force nunroll to be power of 2, as we are unable to
 891      cope with overflows in computation of number of iterations.  */
 892   for (i = 1; 2 * i <= nunroll; i *= 2)
 893     continue;
 894
 895   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 896   loop->lpt_decision.times = i - 1;
 897
 898   if (dump_file)
 899     fprintf (dump_file,
 900              ";; Decided to unroll the runtime computable "
 901              "times rolling loop, %d times.\n",
 902              loop->lpt_decision.times);
 903 }
 904
 905 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 906    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 907    and NULL is returned instead.  */
 908
 909 basic_block
 910 split_edge_and_insert (edge e, rtx insns)
 911 {
 912   basic_block bb;
 913
 914   if (!insns)
 915     return NULL;
 916   bb = split_edge (e);
 917   emit_insn_after (insns, BB_END (bb));
 918
 919   /* ??? We used to assume that INSNS can contain control flow insns, and
 920      that we had to try to find sub basic blocks in BB to maintain a valid
 921      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 922      and call break_superblocks when going out of cfglayout mode.  But it
 923      turns out that this never happens; and that if it does ever happen,
 924      the TODO_verify_flow at the end of the RTL loop passes would fail.
 925
 926      There are two reasons why we expected we could have control flow insns
 927      in INSNS.  The first is when a comparison has to be done in parts, and
 928      the second is when the number of iterations is computed for loops with
 929      the number of iterations known at runtime.  In both cases, test cases
 930      to get control flow in INSNS appear to be impossible to construct:
 931
 932       * If do_compare_rtx_and_jump needs several branches to do comparison
 933         in a mode that needs comparison by parts, we cannot analyze the
 934         number of iterations of the loop, and we never get to unrolling it.
 935
 936       * The code in expand_divmod that was suspected to cause creation of
 937         branching code seems to be only accessed for signed division.  The
 938         divisions used by # of iterations analysis are always unsigned.
 939         Problems might arise on architectures that emits branching code
 940         for some operations that may appear in the unroller (especially
 941         for division), but we have no such architectures.
 942
 943      Considering all this, it was decided that we should for now assume
 944      that INSNS can in theory contain control flow insns, but in practice
 945      it never does.  So we don't handle the theoretical case, and should
 946      a real failure ever show up, we have a pretty good clue for how to
 947      fix it.  */
 948
 949   return bb;
 950 }
 951
/* Unroll LOOP, for which we are able to count the number of iterations at
   run time, LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does
   this (with some extra care for case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }

   The "switch" is built as a chain of compare-and-jump blocks in front of
   peeled copies of the loop body.  The remainder is obtained by ANDing the
   iteration count with LOOP->LPT_DECISION.TIMES, which relies on the number
   of unrollings being a power of two.  On return, the iteration description
   of LOOP and its recorded iteration bounds are updated to describe the
   unrolled loop.  */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  VEC (basic_block, heap) *dom_bbs;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  VEC (edge, heap) *remove_edges;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  /* Insn analysis is needed only when one of the optimizations applied to
     the copies (IV splitting, variable expansion) is enabled.  */
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  These are
     the blocks outside of LOOP that get_dominated_by reports for some block
     of LOOP.  */
  dom_bbs = NULL;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      VEC (basic_block, heap) *ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
        if (!flow_bb_inside_loop_p (loop, bb))
          VEC_safe_push (basic_block, heap, dom_bbs, bb);

      VEC_free (basic_block, heap, ldom);
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  The full count is computed
     into a fresh pseudo; OLD_NITER keeps referring to it after NITER is
     replaced by the remainder below.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
                               niter,
                               GEN_INT (max_unroll),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop: the insns computing the count and its remainder
     go into a new block on the preheader edge.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  remove_edges = NULL;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable).  Also record the place of (possible) extra
     zero check.  */
  sbitmap_zero (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    SET_BIT (wont_exit, 1);
  /* The source of the preheader edge as it is before the first copy is
     peeled; the zero-iterations branch is attached after it later on.  */
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      1, wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  /* Peel the remaining N_PEEL copies.  In front of each of them a block is
     split off the preheader edge, and a compare-and-jump to that block is
     added to the switch chain.  */
  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  Only the last peeled copy may keep its exit test,
         and only if LAST_MAY_EXIT.  */
      sbitmap_zero (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        SET_BIT (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          1, wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  J is the value of the remainder for which
         this item jumps to the new block; P is the probability assigned to
         that jump.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
         and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      /* NOTE(review): the count of the new edge is PREHEADER's count
         multiplied by REG_BR_PROB_BASE and divided by P -- confirm that
         this scaling direction is the intended one.  */
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  It is placed on the successor edge
         of the block recorded in EZC_SWTCH and compares the remainder with
         zero.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      /* NOTE(review): same count scaling as in the peeling loop above --
         confirm.  */
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  Every copy except MAY_EXIT_COPY is marked as not
     exiting.  */

  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  The out edge is the successor edge of EXIT_BLOCK leading to
         the destination of the old out edge; the other one is the in
         edge.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
                         GEN_INT (max_unroll + 1));
  /* The recorded upper bound and estimate shrink by the same factor.  */
  loop->nb_iterations_upper_bound
    = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
                                                                   + 1),
                                            FLOOR_DIV_EXPR);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
                                                                  + 1),
                                           FLOOR_DIV_EXPR);
  if (exit_at_end)
    {
      /* With the exit test in the last copy, the iteration count, its upper
         bound and its estimate are each one less; an estimate that is
         already zero is dropped instead of being decremented.  */
      desc->niter_expr =
        simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
          && loop->nb_iterations_estimate != double_int_zero)
        --loop->nb_iterations_estimate;
      else
        loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, counting # of iterations "
             "in runtime, %i insns\n",
             max_unroll, num_loop_insns (loop));

  VEC_free (basic_block, heap, dom_bbs);
}
1229
1230 /* Decide whether to simply peel LOOP and how much.  */
1231 static void
1232 decide_peel_simple (struct loop *loop, int flags)
1233 {
1234   unsigned npeel;
1235   struct niter_desc *desc;
1236   double_int iterations;
1237
1238   if (!(flags & UAP_PEEL))
1239     {
1240       /* We were not asked to, just return back silently.  */
1241       return;
1242     }
1243
1244   if (dump_file)
1245     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1246
1247   /* npeel = number of iterations to peel.  */
1248   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1249   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1250     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1251
1252   /* Skip big loops.  */
1253   if (!npeel)
1254     {
1255       if (dump_file)
1256         fprintf (dump_file, ";; Not considering loop, is too big\n");
1257       return;
1258     }
1259
1260   /* Check for simple loops.  */
1261   desc = get_simple_loop_desc (loop);
1262
1263   /* Check number of iterations.  */
1264   if (desc->simple_p && !desc->assumptions && desc->const_iter)
1265     {
1266       if (dump_file)
1267         fprintf (dump_file, ";; Loop iterates constant times\n");
1268       return;
1269     }
1270
1271   /* Do not simply peel loops with branches inside -- it increases number
1272      of mispredicts.  */
1273   if (num_loop_branches (loop) > 1)
1274     {
1275       if (dump_file)
1276         fprintf (dump_file, ";; Not peeling, contains branches\n");
1277       return;
1278     }
1279
1280   /* If we have realistic estimate on number of iterations, use it.  */
1281   if (estimated_loop_iterations (loop, &iterations))
1282     {
1283       if (!iterations.fits_shwi ()
1284           || iterations.to_shwi () + 1 > npeel)
1285         {
1286           if (dump_file)
1287             {
1288               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1289               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1290                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1291               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1292                        npeel);
1293             }
1294           return;
1295         }
1296       npeel = iterations.to_shwi () + 1;
1297     }
1298   /* If we have small enough bound on iterations, we can still peel (completely
1299      unroll).  */
1300   else if (max_loop_iterations (loop, &iterations)
1301            && iterations.fits_shwi ()
1302            && iterations.to_shwi () + 1 <= npeel)
1303     npeel = iterations.to_shwi () + 1;
1304   else
1305     {
1306       /* For now we have no good heuristics to decide whether loop peeling
1307          will be effective, so disable it.  */
1308       if (dump_file)
1309         fprintf (dump_file,
1310                  ";; Not peeling loop, no evidence it will be profitable\n");
1311       return;
1312     }
1313
1314   /* Success.  */
1315   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1316   loop->lpt_decision.times = npeel;
1317
1318   if (dump_file)
1319     fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1320              loop->lpt_decision.times);
1321 }
1322
1323 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1324    while (cond)
1325      body;
1326
1327    ==>
1328
1329    if (!cond) goto end;
1330    body;
1331    if (!cond) goto end;
1332    body;
1333    while (cond)
1334      body;
1335    end: ;
1336    */
1337 static void
1338 peel_loop_simple (struct loop *loop)
1339 {
1340   sbitmap wont_exit;
1341   unsigned npeel = loop->lpt_decision.times;
1342   struct niter_desc *desc = get_simple_loop_desc (loop);
1343   struct opt_info *opt_info = NULL;
1344   bool ok;
1345
1346   if (flag_split_ivs_in_unroller && npeel > 1)
1347     opt_info = analyze_insns_in_loop (loop);
1348
1349   wont_exit = sbitmap_alloc (npeel + 1);
1350   sbitmap_zero (wont_exit);
1351
1352   opt_info_start_duplication (opt_info);
1353
1354   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1355                                       npeel, wont_exit, NULL,
1356                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1357                                       | (opt_info
1358                                          ? DLTHE_RECORD_COPY_NUMBER
1359                                            : 0));
1360   gcc_assert (ok);
1361
1362   free (wont_exit);
1363
1364   if (opt_info)
1365     {
1366       apply_opt_in_copies (opt_info, npeel, false, false);
1367       free_opt_info (opt_info);
1368     }
1369
1370   if (desc->simple_p)
1371     {
1372       if (desc->const_iter)
1373         {
1374           desc->niter -= npeel;
1375           desc->niter_expr = GEN_INT (desc->niter);
1376           desc->noloop_assumptions = NULL_RTX;
1377         }
1378       else
1379         {
1380           /* We cannot just update niter_expr, as its value might be clobbered
1381              inside loop.  We could handle this by counting the number into
1382              temporary just like we do in runtime unrolling, but it does not
1383              seem worthwhile.  */
1384           free_simple_loop_desc (loop);
1385         }
1386     }
1387   if (dump_file)
1388     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1389 }
1390
1391 /* Decide whether to unroll LOOP stupidly and how much.  */
1392 static void
1393 decide_unroll_stupid (struct loop *loop, int flags)
1394 {
1395   unsigned nunroll, nunroll_by_av, i;
1396   struct niter_desc *desc;
1397   double_int iterations;
1398
1399   if (!(flags & UAP_UNROLL_ALL))
1400     {
1401       /* We were not asked to, just return back silently.  */
1402       return;
1403     }
1404
1405   if (dump_file)
1406     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1407
1408   /* nunroll = total number of copies of the original loop body in
1409      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1410   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1411   nunroll_by_av
1412     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1413   if (nunroll > nunroll_by_av)
1414     nunroll = nunroll_by_av;
1415   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1416     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1417
1418   if (targetm.loop_unroll_adjust)
1419     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1420
1421   /* Skip big loops.  */
1422   if (nunroll <= 1)
1423     {
1424       if (dump_file)
1425         fprintf (dump_file, ";; Not considering loop, is too big\n");
1426       return;
1427     }
1428
1429   /* Check for simple loops.  */
1430   desc = get_simple_loop_desc (loop);
1431
1432   /* Check simpleness.  */
1433   if (desc->simple_p && !desc->assumptions)
1434     {
1435       if (dump_file)
1436         fprintf (dump_file, ";; The loop is simple\n");
1437       return;
1438     }
1439
1440   /* Do not unroll loops with branches inside -- it increases number
1441      of mispredicts.  */
1442   if (num_loop_branches (loop) > 1)
1443     {
1444       if (dump_file)
1445         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1446       return;
1447     }
1448
1449   /* If we have profile feedback, check whether the loop rolls.  */
1450   if ((estimated_loop_iterations (loop, &iterations)
1451        || max_loop_iterations (loop, &iterations))
1452       && iterations.fits_shwi ()
1453       && iterations.to_shwi () <= 2 * nunroll)
1454     {
1455       if (dump_file)
1456         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1457       return;
1458     }
1459
1460   /* Success.  Now force nunroll to be power of 2, as it seems that this
1461      improves results (partially because of better alignments, partially
1462      because of some dark magic).  */
1463   for (i = 1; 2 * i <= nunroll; i *= 2)
1464     continue;
1465
1466   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1467   loop->lpt_decision.times = i - 1;
1468
1469   if (dump_file)
1470     fprintf (dump_file,
1471              ";; Decided to unroll the loop stupidly, %d times.\n",
1472              loop->lpt_decision.times);
1473 }
1474
1475 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1476    while (cond)
1477      body;
1478
1479    ==>
1480
1481    while (cond)
1482      {
1483        body;
1484        if (!cond) break;
1485        body;
1486        if (!cond) break;
1487        body;
1488        if (!cond) break;
1489        body;
1490      }
1491    */
1492 static void
1493 unroll_loop_stupid (struct loop *loop)
1494 {
1495   sbitmap wont_exit;
1496   unsigned nunroll = loop->lpt_decision.times;
1497   struct niter_desc *desc = get_simple_loop_desc (loop);
1498   struct opt_info *opt_info = NULL;
1499   bool ok;
1500
1501   if (flag_split_ivs_in_unroller
1502       || flag_variable_expansion_in_unroller)
1503     opt_info = analyze_insns_in_loop (loop);
1504
1505
1506   wont_exit = sbitmap_alloc (nunroll + 1);
1507   sbitmap_zero (wont_exit);
1508   opt_info_start_duplication (opt_info);
1509
1510   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1511                                       nunroll, wont_exit,
1512                                       NULL, NULL,
1513                                       DLTHE_FLAG_UPDATE_FREQ
1514                                       | (opt_info
1515                                          ? DLTHE_RECORD_COPY_NUMBER
1516                                            : 0));
1517   gcc_assert (ok);
1518
1519   if (opt_info)
1520     {
1521       apply_opt_in_copies (opt_info, nunroll, true, true);
1522       free_opt_info (opt_info);
1523     }
1524
1525   free (wont_exit);
1526
1527   if (desc->simple_p)
1528     {
1529       /* We indeed may get here provided that there are nontrivial assumptions
1530          for a loop to be really simple.  We could update the counts, but the
1531          problem is that we are unable to decide which exit will be taken
1532          (not really true in case the number of iterations is constant,
1533          but noone will do anything with this information, so we do not
1534          worry about it).  */
1535       desc->simple_p = false;
1536     }
1537
1538   if (dump_file)
1539     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1540              nunroll, num_loop_insns (loop));
1541 }
1542
1543 /* A hash function for information about insns to split.  */
1544
1545 static hashval_t
1546 si_info_hash (const void *ivts)
1547 {
1548   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1549 }
1550
1551 /* An equality functions for information about insns to split.  */
1552
1553 static int
1554 si_info_eq (const void *ivts1, const void *ivts2)
1555 {
1556   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1557   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1558
1559   return i1->insn == i2->insn;
1560 }
1561
1562 /* Return a hash for VES, which is really a "var_to_expand *".  */
1563
1564 static hashval_t
1565 ve_info_hash (const void *ves)
1566 {
1567   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1568 }
1569
1570 /* Return true if IVTS1 and IVTS2 (which are really both of type
1571    "var_to_expand *") refer to the same instruction.  */
1572
1573 static int
1574 ve_info_eq (const void *ivts1, const void *ivts2)
1575 {
1576   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1577   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1578
1579   return i1->insn == i2->insn;
1580 }
1581
1582 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1583    Set *DEBUG_USES to the number of debug insns that reference the
1584    variable.  */
1585
1586 bool
1587 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1588                                   int *debug_uses)
1589 {
1590   basic_block *body, bb;
1591   unsigned i;
1592   int count_ref = 0;
1593   rtx insn;
1594
1595   body = get_loop_body (loop);
1596   for (i = 0; i < loop->num_nodes; i++)
1597     {
1598       bb = body[i];
1599
1600       FOR_BB_INSNS (bb, insn)
1601         if (!rtx_referenced_p (reg, insn))
1602           continue;
1603         else if (DEBUG_INSN_P (insn))
1604           ++*debug_uses;
1605         else if (++count_ref > 1)
1606           break;
1607     }
1608   free (body);
1609   return (count_ref  == 1);
1610 }
1611
1612 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1613
1614 static void
1615 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1616 {
1617   basic_block *body, bb;
1618   unsigned i;
1619   rtx insn;
1620
1621   body = get_loop_body (loop);
1622   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1623     {
1624       bb = body[i];
1625
1626       FOR_BB_INSNS (bb, insn)
1627         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1628           continue;
1629         else
1630           {
1631             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1632                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1633             if (!--debug_uses)
1634               break;
1635           }
1636     }
1637   free (body);
1638 }
1639
1640 /* Determine whether INSN contains an accumulator
1641    which can be expanded into separate copies,
1642    one for each copy of the LOOP body.
1643
1644    for (i = 0 ; i < n; i++)
1645      sum += a[i];
1646
1647    ==>
1648
1649    sum += a[i]
1650    ....
1651    i = i+1;
1652    sum1 += a[i]
1653    ....
1654    i = i+1
1655    sum2 += a[i];
1656    ....
1657
1658    Return NULL if INSN contains no opportunity for expansion of accumulator.
1659    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1660    information and return a pointer to it.
1661 */
1662
1663 static struct var_to_expand *
1664 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1665 {
1666   rtx set, dest, src;
1667   struct var_to_expand *ves;
1668   unsigned accum_pos;
1669   enum rtx_code code;
1670   int debug_uses = 0;
1671
1672   set = single_set (insn);
1673   if (!set)
1674     return NULL;
1675
1676   dest = SET_DEST (set);
1677   src = SET_SRC (set);
1678   code = GET_CODE (src);
1679
1680   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1681     return NULL;
1682
1683   if (FLOAT_MODE_P (GET_MODE (dest)))
1684     {
1685       if (!flag_associative_math)
1686         return NULL;
1687       /* In the case of FMA, we're also changing the rounding.  */
1688       if (code == FMA && !flag_unsafe_math_optimizations)
1689         return NULL;
1690     }
1691
1692   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1693      in MD.  But if there is no optab to generate the insn, we can not
1694      perform the variable expansion.  This can happen if an MD provides
1695      an insn but not a named pattern to generate it, for example to avoid
1696      producing code that needs additional mode switches like for x87/mmx.
1697
1698      So we check have_insn_for which looks for an optab for the operation
1699      in SRC.  If it doesn't exist, we can't perform the expansion even
1700      though INSN is valid.  */
1701   if (!have_insn_for (code, GET_MODE (src)))
1702     return NULL;
1703
1704   if (!REG_P (dest)
1705       && !(GET_CODE (dest) == SUBREG
1706            && REG_P (SUBREG_REG (dest))))
1707     return NULL;
1708
1709   /* Find the accumulator use within the operation.  */
1710   if (code == FMA)
1711     {
1712       /* We only support accumulation via FMA in the ADD position.  */
1713       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1714         return NULL;
1715       accum_pos = 2;
1716     }
1717   else if (rtx_equal_p (dest, XEXP (src, 0)))
1718     accum_pos = 0;
1719   else if (rtx_equal_p (dest, XEXP (src, 1)))
1720     {
1721       /* The method of expansion that we are using; which includes the
1722          initialization of the expansions with zero and the summation of
1723          the expansions at the end of the computation will yield wrong
1724          results for (x = something - x) thus avoid using it in that case.  */
1725       if (code == MINUS)
1726         return NULL;
1727       accum_pos = 1;
1728     }
1729   else
1730     return NULL;
1731
1732   /* It must not otherwise be used.  */
1733   if (code == FMA)
1734     {
1735       if (rtx_referenced_p (dest, XEXP (src, 0))
1736           || rtx_referenced_p (dest, XEXP (src, 1)))
1737         return NULL;
1738     }
1739   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1740     return NULL;
1741
1742   /* It must be used in exactly one insn.  */
1743   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1744     return NULL;
1745
1746   if (dump_file)
1747     {
1748       fprintf (dump_file, "\n;; Expanding Accumulator ");
1749       print_rtl (dump_file, dest);
1750       fprintf (dump_file, "\n");
1751     }
1752
1753   if (debug_uses)
1754     /* Instead of resetting the debug insns, we could replace each
1755        debug use in the loop with the sum or product of all expanded
1756        accummulators.  Since we'll only know of all expansions at the
1757        end, we'd have to keep track of which vars_to_expand a debug
1758        insn in the loop references, take note of each copy of the
1759        debug insn during unrolling, and when it's all done, compute
1760        the sum or product of each variable and adjust the original
1761        debug insn and each copy thereof.  What a pain!  */
1762     reset_debug_uses_in_loop (loop, dest, debug_uses);
1763
1764   /* Record the accumulator to expand.  */
1765   ves = XNEW (struct var_to_expand);
1766   ves->insn = insn;
1767   ves->reg = copy_rtx (dest);
1768   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1769   ves->next = NULL;
1770   ves->op = GET_CODE (src);
1771   ves->expansion_count = 0;
1772   ves->reuse_expansion = 0;
1773   ves->accum_pos = accum_pos;
1774   return ves;
1775 }
1776
1777 /* Determine whether there is an induction variable in INSN that
1778    we would like to split during unrolling.
1779
1780    I.e. replace
1781
1782    i = i + 1;
1783    ...
1784    i = i + 1;
1785    ...
1786    i = i + 1;
1787    ...
1788
1789    type chains by
1790
1791    i0 = i + 1
1792    ...
1793    i = i0 + 1
1794    ...
1795    i = i0 + 2
1796    ...
1797
1798    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1799    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1800    pointer to it.  */
1801
1802 static struct iv_to_split *
1803 analyze_iv_to_split_insn (rtx insn)
1804 {
1805   rtx set, dest;
1806   struct rtx_iv iv;
1807   struct iv_to_split *ivts;
1808   bool ok;
1809
1810   /* For now we just split the basic induction variables.  Later this may be
1811      extended for example by selecting also addresses of memory references.  */
1812   set = single_set (insn);
1813   if (!set)
1814     return NULL;
1815
1816   dest = SET_DEST (set);
1817   if (!REG_P (dest))
1818     return NULL;
1819
1820   if (!biv_p (insn, dest))
1821     return NULL;
1822
1823   ok = iv_analyze_result (insn, dest, &iv);
1824
1825   /* This used to be an assert under the assumption that if biv_p returns
1826      true that iv_analyze_result must also return true.  However, that
1827      assumption is not strictly correct as evidenced by pr25569.
1828
1829      Returning NULL when iv_analyze_result returns false is safe and
1830      avoids the problems in pr25569 until the iv_analyze_* routines
1831      can be fixed, which is apparently hard and time consuming
1832      according to their author.  */
1833   if (! ok)
1834     return NULL;
1835
1836   if (iv.step == const0_rtx
1837       || iv.mode != iv.extend_mode)
1838     return NULL;
1839
1840   /* Record the insn to split.  */
1841   ivts = XNEW (struct iv_to_split);
1842   ivts->insn = insn;
1843   ivts->base_var = NULL_RTX;
1844   ivts->step = iv.step;
1845   ivts->next = NULL;
1846   ivts->n_loc = 1;
1847   ivts->loc[0] = 1;
1848
1849   return ivts;
1850 }
1851
1852 /* Determines which of insns in LOOP can be optimized.
1853    Return a OPT_INFO struct with the relevant hash tables filled
1854    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1855    is undefined for the return value.  */
1856
1857 static struct opt_info *
1858 analyze_insns_in_loop (struct loop *loop)
1859 {
1860   basic_block *body, bb;
1861   unsigned i;
1862   struct opt_info *opt_info = XCNEW (struct opt_info);
1863   rtx insn;
1864   struct iv_to_split *ivts = NULL;
1865   struct var_to_expand *ves = NULL;
1866   PTR *slot1;
1867   PTR *slot2;
1868   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1869   edge exit;
1870   bool can_apply = false;
1871
1872   iv_analysis_loop_init (loop);
1873
1874   body = get_loop_body (loop);
1875
1876   if (flag_split_ivs_in_unroller)
1877     {
1878       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1879                                               si_info_hash, si_info_eq, free);
1880       opt_info->iv_to_split_head = NULL;
1881       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1882     }
1883
1884   /* Record the loop exit bb and loop preheader before the unrolling.  */
1885   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1886
1887   if (VEC_length (edge, edges) == 1)
1888     {
1889       exit = VEC_index (edge, edges, 0);
1890       if (!(exit->flags & EDGE_COMPLEX))
1891         {
1892           opt_info->loop_exit = split_edge (exit);
1893           can_apply = true;
1894         }
1895     }
1896
1897   if (flag_variable_expansion_in_unroller
1898       && can_apply)
1899     {
1900       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1901                                                         ve_info_hash,
1902                                                         ve_info_eq, free);
1903       opt_info->var_to_expand_head = NULL;
1904       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1905     }
1906
1907   for (i = 0; i < loop->num_nodes; i++)
1908     {
1909       bb = body[i];
1910       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1911         continue;
1912
1913       FOR_BB_INSNS (bb, insn)
1914       {
1915         if (!INSN_P (insn))
1916           continue;
1917
1918         if (opt_info->insns_to_split)
1919           ivts = analyze_iv_to_split_insn (insn);
1920
1921         if (ivts)
1922           {
1923             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1924             gcc_assert (*slot1 == NULL);
1925             *slot1 = ivts;
1926             *opt_info->iv_to_split_tail = ivts;
1927             opt_info->iv_to_split_tail = &ivts->next;
1928             continue;
1929           }
1930
1931         if (opt_info->insns_with_var_to_expand)
1932           ves = analyze_insn_to_expand_var (loop, insn);
1933
1934         if (ves)
1935           {
1936             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1937             gcc_assert (*slot2 == NULL);
1938             *slot2 = ves;
1939             *opt_info->var_to_expand_tail = ves;
1940             opt_info->var_to_expand_tail = &ves->next;
1941           }
1942       }
1943     }
1944
1945   VEC_free (edge, heap, edges);
1946   free (body);
1947   return opt_info;
1948 }
1949
1950 /* Called just before loop duplication.  Records start of duplicated area
1951    to OPT_INFO.  */
1952
1953 static void
1954 opt_info_start_duplication (struct opt_info *opt_info)
1955 {
1956   if (opt_info)
1957     opt_info->first_new_block = last_basic_block;
1958 }
1959
/* Return how many iterations separate the initialization of the base
   variable from copy number N_COPY.  N_COPIES is the total number of
   newly created copies; UNROLLING is true when the loop is being
   unrolled and false when it is being peeled.

   When unrolling, the base variable is initialized in the original body
   (copy 0), so the distance is simply N_COPY.  When peeling, the
   initialization lives in copy 1 and the original body (copy 0) comes
   last, so copy K > 0 is K - 1 iterations away and copy 0 is N_COPIES
   iterations away.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    return n_copy;

  return n_copy != 0 ? n_copy - 1 : n_copies;
}
1984
1985 /* Locate in EXPR the expression corresponding to the location recorded
1986    in IVTS, and return a pointer to the RTX for this location.  */
1987
1988 static rtx *
1989 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1990 {
1991   unsigned i;
1992   rtx *ret = &expr;
1993
1994   for (i = 0; i < ivts->n_loc; i++)
1995     ret = &XEXP (*ret, ivts->loc[i]);
1996
1997   return ret;
1998 }
1999
2000 /* Allocate basic variable for the induction variable chain.  */
2001
2002 static void
2003 allocate_basic_variable (struct iv_to_split *ivts)
2004 {
2005   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2006
2007   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2008 }
2009
2010 /* Insert initialization of basic variable of IVTS before INSN, taking
2011    the initial value from INSN.  */
2012
2013 static void
2014 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2015 {
2016   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2017   rtx seq;
2018
2019   start_sequence ();
2020   expr = force_operand (expr, ivts->base_var);
2021   if (expr != ivts->base_var)
2022     emit_move_insn (ivts->base_var, expr);
2023   seq = get_insns ();
2024   end_sequence ();
2025
2026   emit_insn_before (seq, insn);
2027 }
2028
2029 /* Replace the use of induction variable described in IVTS in INSN
2030    by base variable + DELTA * step.  */
2031
2032 static void
2033 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2034 {
2035   rtx expr, *loc, seq, incr, var;
2036   enum machine_mode mode = GET_MODE (ivts->base_var);
2037   rtx src, dest, set;
2038
2039   /* Construct base + DELTA * step.  */
2040   if (!delta)
2041     expr = ivts->base_var;
2042   else
2043     {
2044       incr = simplify_gen_binary (MULT, mode,
2045                                   ivts->step, gen_int_mode (delta, mode));
2046       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2047                                   ivts->base_var, incr);
2048     }
2049
2050   /* Figure out where to do the replacement.  */
2051   loc = get_ivts_expr (single_set (insn), ivts);
2052
2053   /* If we can make the replacement right away, we're done.  */
2054   if (validate_change (insn, loc, expr, 0))
2055     return;
2056
2057   /* Otherwise, force EXPR into a register and try again.  */
2058   start_sequence ();
2059   var = gen_reg_rtx (mode);
2060   expr = force_operand (expr, var);
2061   if (expr != var)
2062     emit_move_insn (var, expr);
2063   seq = get_insns ();
2064   end_sequence ();
2065   emit_insn_before (seq, insn);
2066
2067   if (validate_change (insn, loc, var, 0))
2068     return;
2069
2070   /* The last chance.  Try recreating the assignment in insn
2071      completely from scratch.  */
2072   set = single_set (insn);
2073   gcc_assert (set);
2074
2075   start_sequence ();
2076   *loc = var;
2077   src = copy_rtx (SET_SRC (set));
2078   dest = copy_rtx (SET_DEST (set));
2079   src = force_operand (src, dest);
2080   if (src != dest)
2081     emit_move_insn (dest, src);
2082   seq = get_insns ();
2083   end_sequence ();
2084
2085   emit_insn_before (seq, insn);
2086   delete_insn (insn);
2087 }
2088
2089
2090 /* Return one expansion of the accumulator recorded in struct VE.  */
2091
2092 static rtx
2093 get_expansion (struct var_to_expand *ve)
2094 {
2095   rtx reg;
2096
2097   if (ve->reuse_expansion == 0)
2098     reg = ve->reg;
2099   else
2100     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2101
2102   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2103     ve->reuse_expansion = 0;
2104   else
2105     ve->reuse_expansion++;
2106
2107   return reg;
2108 }
2109
2110
2111 /* Given INSN replace the uses of the accumulator recorded in VE
2112    with a new register.  */
2113
2114 static void
2115 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2116 {
2117   rtx new_reg, set;
2118   bool really_new_expansion = false;
2119
2120   set = single_set (insn);
2121   gcc_assert (set);
2122
2123   /* Generate a new register only if the expansion limit has not been
2124      reached.  Else reuse an already existing expansion.  */
2125   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2126     {
2127       really_new_expansion = true;
2128       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2129     }
2130   else
2131     new_reg = get_expansion (ve);
2132
2133   validate_change (insn, &SET_DEST (set), new_reg, 1);
2134   validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2135
2136   if (apply_change_group ())
2137     if (really_new_expansion)
2138       {
2139         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2140         ve->expansion_count++;
2141       }
2142 }
2143
2144 /* Initialize the variable expansions in loop preheader.  PLACE is the
2145    loop-preheader basic block where the initialization of the
2146    expansions should take place.  The expansions are initialized with
2147    (-0) when the operation is plus or minus to honor sign zero.  This
2148    way we can prevent cases where the sign of the final result is
2149    effected by the sign of the expansion.  Here is an example to
2150    demonstrate this:
2151
2152    for (i = 0 ; i < n; i++)
2153      sum += something;
2154
2155    ==>
2156
2157    sum += something
2158    ....
2159    i = i+1;
2160    sum1 += something
2161    ....
2162    i = i+1
2163    sum2 += something;
2164    ....
2165
2166    When SUM is initialized with -zero and SOMETHING is also -zero; the
2167    final result of sum should be -zero thus the expansions sum1 and sum2
2168    should be initialized with -zero as well (otherwise we will get +zero
2169    as the final result).  */
2170
2171 static void
2172 insert_var_expansion_initialization (struct var_to_expand *ve,
2173                                      basic_block place)
2174 {
2175   rtx seq, var, zero_init, insn;
2176   unsigned i;
2177   enum machine_mode mode = GET_MODE (ve->reg);
2178   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2179
2180   if (VEC_length (rtx, ve->var_expansions) == 0)
2181     return;
2182
2183   start_sequence ();
2184   switch (ve->op)
2185     {
2186     case FMA:
2187       /* Note that we only accumulate FMA via the ADD operand.  */
2188     case PLUS:
2189     case MINUS:
2190       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2191         {
2192           if (honor_signed_zero_p)
2193             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2194           else
2195             zero_init = CONST0_RTX (mode);
2196           emit_move_insn (var, zero_init);
2197         }
2198       break;
2199
2200     case MULT:
2201       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2202         {
2203           zero_init = CONST1_RTX (GET_MODE (var));
2204           emit_move_insn (var, zero_init);
2205         }
2206       break;
2207
2208     default:
2209       gcc_unreachable ();
2210     }
2211
2212   seq = get_insns ();
2213   end_sequence ();
2214
2215   insn = BB_HEAD (place);
2216   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2217     insn = NEXT_INSN (insn);
2218
2219   emit_insn_after (seq, insn);
2220 }
2221
2222 /* Combine the variable expansions at the loop exit.  PLACE is the
2223    loop exit basic block where the summation of the expansions should
2224    take place.  */
2225
2226 static void
2227 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2228 {
2229   rtx sum = ve->reg;
2230   rtx expr, seq, var, insn;
2231   unsigned i;
2232
2233   if (VEC_length (rtx, ve->var_expansions) == 0)
2234     return;
2235
2236   start_sequence ();
2237   switch (ve->op)
2238     {
2239     case FMA:
2240       /* Note that we only accumulate FMA via the ADD operand.  */
2241     case PLUS:
2242     case MINUS:
2243       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2244         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2245       break;
2246
2247     case MULT:
2248       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2249         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2250       break;
2251
2252     default:
2253       gcc_unreachable ();
2254     }
2255
2256   expr = force_operand (sum, ve->reg);
2257   if (expr != ve->reg)
2258     emit_move_insn (ve->reg, expr);
2259   seq = get_insns ();
2260   end_sequence ();
2261
2262   insn = BB_HEAD (place);
2263   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2264     insn = NEXT_INSN (insn);
2265
2266   emit_insn_after (seq, insn);
2267 }
2268
2269 /* Apply loop optimizations in loop copies using the
2270    data which gathered during the unrolling.  Structure
2271    OPT_INFO record that data.
2272
2273    UNROLLING is true if we unrolled (not peeled) the loop.
2274    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2275    the loop (as it should happen in complete unrolling, but not in ordinary
2276    peeling of the loop).  */
2277
2278 static void
2279 apply_opt_in_copies (struct opt_info *opt_info,
2280                      unsigned n_copies, bool unrolling,
2281                      bool rewrite_original_loop)
2282 {
2283   unsigned i, delta;
2284   basic_block bb, orig_bb;
2285   rtx insn, orig_insn, next;
2286   struct iv_to_split ivts_templ, *ivts;
2287   struct var_to_expand ve_templ, *ves;
2288
2289   /* Sanity check -- we need to put initialization in the original loop
2290      body.  */
2291   gcc_assert (!unrolling || rewrite_original_loop);
2292
2293   /* Allocate the basic variables (i0).  */
2294   if (opt_info->insns_to_split)
2295     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2296       allocate_basic_variable (ivts);
2297
2298   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2299     {
2300       bb = BASIC_BLOCK (i);
2301       orig_bb = get_bb_original (bb);
2302
2303       /* bb->aux holds position in copy sequence initialized by
2304          duplicate_loop_to_header_edge.  */
2305       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2306                                         unrolling);
2307       bb->aux = 0;
2308       orig_insn = BB_HEAD (orig_bb);
2309       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2310         {
2311           next = NEXT_INSN (insn);
2312           if (!INSN_P (insn)
2313               || (DEBUG_INSN_P (insn)
2314                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2315             continue;
2316
2317           while (!INSN_P (orig_insn)
2318                  || (DEBUG_INSN_P (orig_insn)
2319                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2320                          == LABEL_DECL)))
2321             orig_insn = NEXT_INSN (orig_insn);
2322
2323           ivts_templ.insn = orig_insn;
2324           ve_templ.insn = orig_insn;
2325
2326           /* Apply splitting iv optimization.  */
2327           if (opt_info->insns_to_split)
2328             {
2329               ivts = (struct iv_to_split *)
2330                 htab_find (opt_info->insns_to_split, &ivts_templ);
2331
2332               if (ivts)
2333                 {
2334                   gcc_assert (GET_CODE (PATTERN (insn))
2335                               == GET_CODE (PATTERN (orig_insn)));
2336
2337                   if (!delta)
2338                     insert_base_initialization (ivts, insn);
2339                   split_iv (ivts, insn, delta);
2340                 }
2341             }
2342           /* Apply variable expansion optimization.  */
2343           if (unrolling && opt_info->insns_with_var_to_expand)
2344             {
2345               ves = (struct var_to_expand *)
2346                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2347               if (ves)
2348                 {
2349                   gcc_assert (GET_CODE (PATTERN (insn))
2350                               == GET_CODE (PATTERN (orig_insn)));
2351                   expand_var_during_unrolling (ves, insn);
2352                 }
2353             }
2354           orig_insn = NEXT_INSN (orig_insn);
2355         }
2356     }
2357
2358   if (!rewrite_original_loop)
2359     return;
2360
2361   /* Initialize the variable expansions in the loop preheader
2362      and take care of combining them at the loop exit.  */
2363   if (opt_info->insns_with_var_to_expand)
2364     {
2365       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2366         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2367       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2368         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2369     }
2370
2371   /* Rewrite also the original loop body.  Find them as originals of the blocks
2372      in the last copied iteration, i.e. those that have
2373      get_bb_copy (get_bb_original (bb)) == bb.  */
2374   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2375     {
2376       bb = BASIC_BLOCK (i);
2377       orig_bb = get_bb_original (bb);
2378       if (get_bb_copy (orig_bb) != bb)
2379         continue;
2380
2381       delta = determine_split_iv_delta (0, n_copies, unrolling);
2382       for (orig_insn = BB_HEAD (orig_bb);
2383            orig_insn != NEXT_INSN (BB_END (bb));
2384            orig_insn = next)
2385         {
2386           next = NEXT_INSN (orig_insn);
2387
2388           if (!INSN_P (orig_insn))
2389             continue;
2390
2391           ivts_templ.insn = orig_insn;
2392           if (opt_info->insns_to_split)
2393             {
2394               ivts = (struct iv_to_split *)
2395                 htab_find (opt_info->insns_to_split, &ivts_templ);
2396               if (ivts)
2397                 {
2398                   if (!delta)
2399                     insert_base_initialization (ivts, orig_insn);
2400                   split_iv (ivts, orig_insn, delta);
2401                   continue;
2402                 }
2403             }
2404
2405         }
2406     }
2407 }
2408
2409 /* Release OPT_INFO.  */
2410
2411 static void
2412 free_opt_info (struct opt_info *opt_info)
2413 {
2414   if (opt_info->insns_to_split)
2415     htab_delete (opt_info->insns_to_split);
2416   if (opt_info->insns_with_var_to_expand)
2417     {
2418       struct var_to_expand *ves;
2419
2420       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2421         VEC_free (rtx, heap, ves->var_expansions);
2422       htab_delete (opt_info->insns_with_var_to_expand);
2423     }
2424   free (opt_info);
2425 }