gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hashtab.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
   42    by fewer tests of exit conditions.  The main problem is code growth,
   43    which impacts performance negatively due to cache effects.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
   61    For more detailed descriptions of each of those, see the comments at
   62    the appropriate functions below.
  63
   64    There are many parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
   67    ??? A great problem is that we don't have a good way to determine
   68    how many times we should unroll the loop; the experiments I have made
   69    showed that this choice may affect performance on the order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx base_var;         /* The variable on that the values in the further
  78                            iterations are based.  */
  79   rtx step;             /* Step of the induction variable.  */
  80   struct iv_to_split *next; /* Next entry in walking order.  */
  81   unsigned n_loc;
  82   unsigned loc[3];      /* Location where the definition of the induction
  83                            variable occurs in the insn.  For example if
  84                            N_LOC is 2, the expression is located at
  85                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  86 };
  87
  88 /* Information about accumulators to expand.  */
  89
  90 struct var_to_expand
  91 {
  92   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  93   rtx reg;                         /* The accumulator which is expanded.  */
  94   VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  95   struct var_to_expand *next;      /* Next entry in walking order.  */
  96   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  97                                       or multiplication.  */
  98   int expansion_count;             /* Count the number of expansions generated so far.  */
  99   int reuse_expansion;             /* The expansion we intend to reuse to expand
 100                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 101                                       the original accumulator.  Else use
 102                                       var_expansions[REUSE_EXPANSION - 1].  */
 103   unsigned accum_pos;              /* The position in which the accumulator is placed in
 104                                       the insn src.  For example in x = x + something
 105                                       accum_pos is 0 while in x = something + x accum_pos
 106                                       is 1.  */
 107 };
 108
 109 /* Information about optimization applied in
 110    the unrolled loop.  */
 111
 112 struct opt_info
 113 {
 114   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 115   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 116   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 117   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 118                                       to expand.  */
 119   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 120   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 121   unsigned first_new_block;        /* The first basic block that was
 122                                       duplicated.  */
 123   basic_block loop_exit;           /* The loop exit basic block.  */
 124   basic_block loop_preheader;      /* The loop preheader basic block.  */
 125 };
 126
 127 static void decide_unrolling_and_peeling (int);
 128 static void peel_loops_completely (int);
 129 static void decide_peel_simple (struct loop *, int);
 130 static void decide_peel_once_rolling (struct loop *, int);
 131 static void decide_peel_completely (struct loop *, int);
 132 static void decide_unroll_stupid (struct loop *, int);
 133 static void decide_unroll_constant_iterations (struct loop *, int);
 134 static void decide_unroll_runtime_iterations (struct loop *, int);
 135 static void peel_loop_simple (struct loop *);
 136 static void peel_loop_completely (struct loop *);
 137 static void unroll_loop_stupid (struct loop *);
 138 static void unroll_loop_constant_iterations (struct loop *);
 139 static void unroll_loop_runtime_iterations (struct loop *);
 140 static struct opt_info *analyze_insns_in_loop (struct loop *);
 141 static void opt_info_start_duplication (struct opt_info *);
 142 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 143 static void free_opt_info (struct opt_info *);
 144 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 145 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 146 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 147 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 148 static void insert_var_expansion_initialization (struct var_to_expand *,
 149                                                  basic_block);
 150 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 151                                              basic_block);
 152 static rtx get_expansion (struct var_to_expand *);
 153
 154 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 155 void
 156 unroll_and_peel_loops (int flags)
 157 {
 158   struct loop *loop;
 159   bool check;
 160   loop_iterator li;
 161
 162   /* First perform complete loop peeling (it is almost surely a win,
 163      and affects parameters for further decision a lot).  */
 164   peel_loops_completely (flags);
 165
 166   /* Now decide rest of unrolling and peeling.  */
 167   decide_unrolling_and_peeling (flags);
 168
 169   /* Scan the loops, inner ones first.  */
 170   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 171     {
 172       check = true;
 173       /* And perform the appropriate transformations.  */
 174       switch (loop->lpt_decision.decision)
 175         {
 176         case LPT_PEEL_COMPLETELY:
 177           /* Already done.  */
 178           gcc_unreachable ();
 179         case LPT_PEEL_SIMPLE:
 180           peel_loop_simple (loop);
 181           break;
 182         case LPT_UNROLL_CONSTANT:
 183           unroll_loop_constant_iterations (loop);
 184           break;
 185         case LPT_UNROLL_RUNTIME:
 186           unroll_loop_runtime_iterations (loop);
 187           break;
 188         case LPT_UNROLL_STUPID:
 189           unroll_loop_stupid (loop);
 190           break;
 191         case LPT_NONE:
 192           check = false;
 193           break;
 194         default:
 195           gcc_unreachable ();
 196         }
 197       if (check)
 198         {
 199 #ifdef ENABLE_CHECKING
 200           verify_loop_structure ();
 201 #endif
 202         }
 203     }
 204
 205   iv_analysis_done ();
 206 }
 207
 208 /* Check whether exit of the LOOP is at the end of loop body.  */
 209
 210 static bool
 211 loop_exit_at_end_p (struct loop *loop)
 212 {
 213   struct niter_desc *desc = get_simple_loop_desc (loop);
 214   rtx insn;
 215
 216   if (desc->in_edge->dest != loop->latch)
 217     return false;
 218
 219   /* Check that the latch is empty.  */
 220   FOR_BB_INSNS (loop->latch, insn)
 221     {
 222       if (INSN_P (insn))
 223         return false;
 224     }
 225
 226   return true;
 227 }
 228
 229 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 230 static void
 231 peel_loops_completely (int flags)
 232 {
 233   struct loop *loop;
 234   loop_iterator li;
 235
 236   /* Scan the loops, the inner ones first.  */
 237   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 238     {
 239       loop->lpt_decision.decision = LPT_NONE;
 240
 241       if (dump_file)
 242         fprintf (dump_file,
 243                  "\n;; *** Considering loop %d for complete peeling ***\n",
 244                  loop->num);
 245
 246       loop->ninsns = num_loop_insns (loop);
 247
 248       decide_peel_once_rolling (loop, flags);
 249       if (loop->lpt_decision.decision == LPT_NONE)
 250         decide_peel_completely (loop, flags);
 251
 252       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 253         {
 254           peel_loop_completely (loop);
 255 #ifdef ENABLE_CHECKING
 256           verify_loop_structure ();
 257 #endif
 258         }
 259     }
 260 }
 261
 262 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 263 static void
 264 decide_unrolling_and_peeling (int flags)
 265 {
 266   struct loop *loop;
 267   loop_iterator li;
 268
 269   /* Scan the loops, inner ones first.  */
 270   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 271     {
 272       loop->lpt_decision.decision = LPT_NONE;
 273
 274       if (dump_file)
 275         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 276
 277       /* Do not peel cold areas.  */
 278       if (optimize_loop_for_size_p (loop))
 279         {
 280           if (dump_file)
 281             fprintf (dump_file, ";; Not considering loop, cold area\n");
 282           continue;
 283         }
 284
 285       /* Can the loop be manipulated?  */
 286       if (!can_duplicate_loop_p (loop))
 287         {
 288           if (dump_file)
 289             fprintf (dump_file,
 290                      ";; Not considering loop, cannot duplicate\n");
 291           continue;
 292         }
 293
 294       /* Skip non-innermost loops.  */
 295       if (loop->inner)
 296         {
 297           if (dump_file)
 298             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 299           continue;
 300         }
 301
 302       loop->ninsns = num_loop_insns (loop);
 303       loop->av_ninsns = average_num_loop_insns (loop);
 304
 305       /* Try transformations one by one in decreasing order of
 306          priority.  */
 307
 308       decide_unroll_constant_iterations (loop, flags);
 309       if (loop->lpt_decision.decision == LPT_NONE)
 310         decide_unroll_runtime_iterations (loop, flags);
 311       if (loop->lpt_decision.decision == LPT_NONE)
 312         decide_unroll_stupid (loop, flags);
 313       if (loop->lpt_decision.decision == LPT_NONE)
 314         decide_peel_simple (loop, flags);
 315     }
 316 }
 317
 318 /* Decide whether the LOOP is once rolling and suitable for complete
 319    peeling.  */
 320 static void
 321 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 322 {
 323   struct niter_desc *desc;
 324
 325   if (dump_file)
 326     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 327
 328   /* Is the loop small enough?  */
 329   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 330     {
 331       if (dump_file)
 332         fprintf (dump_file, ";; Not considering loop, is too big\n");
 333       return;
 334     }
 335
 336   /* Check for simple loops.  */
 337   desc = get_simple_loop_desc (loop);
 338
 339   /* Check number of iterations.  */
 340   if (!desc->simple_p
 341       || desc->assumptions
 342       || desc->infinite
 343       || !desc->const_iter
 344       || (desc->niter != 0
 345           && max_loop_iterations_int (loop) != 0))
 346     {
 347       if (dump_file)
 348         fprintf (dump_file,
 349                  ";; Unable to prove that the loop rolls exactly once\n");
 350       return;
 351     }
 352
 353   /* Success.  */
 354   if (dump_file)
 355     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 356   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 357 }
 358
 359 /* Decide whether the LOOP is suitable for complete peeling.  */
 360 static void
 361 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 362 {
 363   unsigned npeel;
 364   struct niter_desc *desc;
 365
 366   if (dump_file)
 367     fprintf (dump_file, "\n;; Considering peeling completely\n");
 368
 369   /* Skip non-innermost loops.  */
 370   if (loop->inner)
 371     {
 372       if (dump_file)
 373         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 374       return;
 375     }
 376
 377   /* Do not peel cold areas.  */
 378   if (optimize_loop_for_size_p (loop))
 379     {
 380       if (dump_file)
 381         fprintf (dump_file, ";; Not considering loop, cold area\n");
 382       return;
 383     }
 384
 385   /* Can the loop be manipulated?  */
 386   if (!can_duplicate_loop_p (loop))
 387     {
 388       if (dump_file)
 389         fprintf (dump_file,
 390                  ";; Not considering loop, cannot duplicate\n");
 391       return;
 392     }
 393
 394   /* npeel = number of iterations to peel.  */
 395   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 396   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 397     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 398
 399   /* Is the loop small enough?  */
 400   if (!npeel)
 401     {
 402       if (dump_file)
 403         fprintf (dump_file, ";; Not considering loop, is too big\n");
 404       return;
 405     }
 406
 407   /* Check for simple loops.  */
 408   desc = get_simple_loop_desc (loop);
 409
 410   /* Check number of iterations.  */
 411   if (!desc->simple_p
 412       || desc->assumptions
 413       || !desc->const_iter
 414       || desc->infinite)
 415     {
 416       if (dump_file)
 417         fprintf (dump_file,
 418                  ";; Unable to prove that the loop iterates constant times\n");
 419       return;
 420     }
 421
 422   if (desc->niter > npeel - 1)
 423     {
 424       if (dump_file)
 425         {
 426           fprintf (dump_file,
 427                    ";; Not peeling loop completely, rolls too much (");
 428           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 429           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 430         }
 431       return;
 432     }
 433
 434   /* Success.  */
 435   if (dump_file)
 436     fprintf (dump_file, ";; Decided to peel loop completely\n");
 437   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 438 }
 439
 440 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 441    completely.  The transformation done:
 442
 443    for (i = 0; i < 4; i++)
 444      body;
 445
 446    ==>
 447
 448    i = 0;
 449    body; i++;
 450    body; i++;
 451    body; i++;
 452    body; i++;
 453    */
 454 static void
 455 peel_loop_completely (struct loop *loop)
 456 {
 457   sbitmap wont_exit;
 458   unsigned HOST_WIDE_INT npeel;
 459   unsigned i;
 460   VEC (edge, heap) *remove_edges;
 461   edge ein;
 462   struct niter_desc *desc = get_simple_loop_desc (loop);
 463   struct opt_info *opt_info = NULL;
 464
 465   npeel = desc->niter;
 466
 467   if (npeel)
 468     {
 469       bool ok;
 470
 471       wont_exit = sbitmap_alloc (npeel + 1);
 472       sbitmap_ones (wont_exit);
 473       RESET_BIT (wont_exit, 0);
 474       if (desc->noloop_assumptions)
 475         RESET_BIT (wont_exit, 1);
 476
 477       remove_edges = NULL;
 478
 479       if (flag_split_ivs_in_unroller)
 480         opt_info = analyze_insns_in_loop (loop);
 481
 482       opt_info_start_duplication (opt_info);
 483       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 484                                           npeel,
 485                                           wont_exit, desc->out_edge,
 486                                           &remove_edges,
 487                                           DLTHE_FLAG_UPDATE_FREQ
 488                                           | DLTHE_FLAG_COMPLETTE_PEEL
 489                                           | (opt_info
 490                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 491       gcc_assert (ok);
 492
 493       free (wont_exit);
 494
 495       if (opt_info)
 496         {
 497           apply_opt_in_copies (opt_info, npeel, false, true);
 498           free_opt_info (opt_info);
 499         }
 500
 501       /* Remove the exit edges.  */
 502       FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
 503         remove_path (ein);
 504       VEC_free (edge, heap, remove_edges);
 505     }
 506
 507   ein = desc->in_edge;
 508   free_simple_loop_desc (loop);
 509
 510   /* Now remove the unreachable part of the last iteration and cancel
 511      the loop.  */
 512   remove_path (ein);
 513
 514   if (dump_file)
 515     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 516 }
 517
 518 /* Decide whether to unroll LOOP iterating constant number of times
 519    and how much.  */
 520
 521 static void
 522 decide_unroll_constant_iterations (struct loop *loop, int flags)
 523 {
 524   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 525   struct niter_desc *desc;
 526
 527   if (!(flags & UAP_UNROLL))
 528     {
 529       /* We were not asked to, just return back silently.  */
 530       return;
 531     }
 532
 533   if (dump_file)
 534     fprintf (dump_file,
 535              "\n;; Considering unrolling loop with constant "
 536              "number of iterations\n");
 537
 538   /* nunroll = total number of copies of the original loop body in
 539      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 540   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 541   nunroll_by_av
 542     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 543   if (nunroll > nunroll_by_av)
 544     nunroll = nunroll_by_av;
 545   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 546     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 547
 548   /* Skip big loops.  */
 549   if (nunroll <= 1)
 550     {
 551       if (dump_file)
 552         fprintf (dump_file, ";; Not considering loop, is too big\n");
 553       return;
 554     }
 555
 556   /* Check for simple loops.  */
 557   desc = get_simple_loop_desc (loop);
 558
 559   /* Check number of iterations.  */
 560   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 561     {
 562       if (dump_file)
 563         fprintf (dump_file,
 564                  ";; Unable to prove that the loop iterates constant times\n");
 565       return;
 566     }
 567
 568   /* Check whether the loop rolls enough to consider.  */
 569   if (desc->niter < 2 * nunroll)
 570     {
 571       if (dump_file)
 572         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 573       return;
 574     }
 575
 576   /* Success; now compute number of iterations to unroll.  We alter
 577      nunroll so that as few as possible copies of loop body are
 578      necessary, while still not decreasing the number of unrollings
 579      too much (at most by 1).  */
 580   best_copies = 2 * nunroll + 10;
 581
 582   i = 2 * nunroll + 2;
 583   if (i - 1 >= desc->niter)
 584     i = desc->niter - 2;
 585
 586   for (; i >= nunroll - 1; i--)
 587     {
 588       unsigned exit_mod = desc->niter % (i + 1);
 589
 590       if (!loop_exit_at_end_p (loop))
 591         n_copies = exit_mod + i + 1;
 592       else if (exit_mod != (unsigned) i
 593                || desc->noloop_assumptions != NULL_RTX)
 594         n_copies = exit_mod + i + 2;
 595       else
 596         n_copies = i + 1;
 597
 598       if (n_copies < best_copies)
 599         {
 600           best_copies = n_copies;
 601           best_unroll = i;
 602         }
 603     }
 604
 605   if (dump_file)
 606     fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
 607              best_unroll + 1, best_copies, nunroll);
 608
 609   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 610   loop->lpt_decision.times = best_unroll;
 611
 612   if (dump_file)
 613     fprintf (dump_file,
 614              ";; Decided to unroll the constant times rolling loop, %d times.\n",
 615              loop->lpt_decision.times);
 616 }
 617
 618 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
 619    times.  The transformation does this:
 620
 621    for (i = 0; i < 102; i++)
 622      body;
 623
 624    ==>
 625
 626    i = 0;
 627    body; i++;
 628    body; i++;
 629    while (i < 102)
 630      {
 631        body; i++;
 632        body; i++;
 633        body; i++;
 634        body; i++;
 635      }
 636   */
 637 static void
 638 unroll_loop_constant_iterations (struct loop *loop)
 639 {
 640   unsigned HOST_WIDE_INT niter;
 641   unsigned exit_mod;
 642   sbitmap wont_exit;
 643   unsigned i;
 644   VEC (edge, heap) *remove_edges;
 645   edge e;
 646   unsigned max_unroll = loop->lpt_decision.times;
 647   struct niter_desc *desc = get_simple_loop_desc (loop);
 648   bool exit_at_end = loop_exit_at_end_p (loop);
 649   struct opt_info *opt_info = NULL;
 650   bool ok;
 651
 652   niter = desc->niter;
 653
 654   /* Should not get here (such loop should be peeled instead).  */
 655   gcc_assert (niter > max_unroll + 1);
 656
 657   exit_mod = niter % (max_unroll + 1);
 658
 659   wont_exit = sbitmap_alloc (max_unroll + 1);
 660   sbitmap_ones (wont_exit);
 661
 662   remove_edges = NULL;
 663   if (flag_split_ivs_in_unroller
 664       || flag_variable_expansion_in_unroller)
 665     opt_info = analyze_insns_in_loop (loop);
 666
 667   if (!exit_at_end)
 668     {
 669       /* The exit is not at the end of the loop; leave exit test
 670          in the first copy, so that the loops that start with test
 671          of exit condition have continuous body after unrolling.  */
 672
 673       if (dump_file)
 674         fprintf (dump_file, ";; Condition on beginning of loop.\n");
 675
 676       /* Peel exit_mod iterations.  */
 677       RESET_BIT (wont_exit, 0);
 678       if (desc->noloop_assumptions)
 679         RESET_BIT (wont_exit, 1);
 680
 681       if (exit_mod)
 682         {
 683           opt_info_start_duplication (opt_info);
 684           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 685                                               exit_mod,
 686                                               wont_exit, desc->out_edge,
 687                                               &remove_edges,
 688                                               DLTHE_FLAG_UPDATE_FREQ
 689                                               | (opt_info && exit_mod > 1
 690                                                  ? DLTHE_RECORD_COPY_NUMBER
 691                                                    : 0));
 692           gcc_assert (ok);
 693
 694           if (opt_info && exit_mod > 1)
 695             apply_opt_in_copies (opt_info, exit_mod, false, false);
 696
 697           desc->noloop_assumptions = NULL_RTX;
 698           desc->niter -= exit_mod;
 699           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 700           if (loop->any_estimate
 701               && double_int::from_uhwi (exit_mod).ule
 702                    (loop->nb_iterations_estimate))
 703             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 704           else
 705             loop->any_estimate = false;
 706         }
 707
 708       SET_BIT (wont_exit, 1);
 709     }
 710   else
 711     {
 712       /* Leave exit test in last copy, for the same reason as above if
 713          the loop tests the condition at the end of loop body.  */
 714
 715       if (dump_file)
 716         fprintf (dump_file, ";; Condition on end of loop.\n");
 717
 718       /* We know that niter >= max_unroll + 2; so we do not need to care of
 719          case when we would exit before reaching the loop.  So just peel
 720          exit_mod + 1 iterations.  */
 721       if (exit_mod != max_unroll
 722           || desc->noloop_assumptions)
 723         {
 724           RESET_BIT (wont_exit, 0);
 725           if (desc->noloop_assumptions)
 726             RESET_BIT (wont_exit, 1);
 727
 728           opt_info_start_duplication (opt_info);
 729           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 730                                               exit_mod + 1,
 731                                               wont_exit, desc->out_edge,
 732                                               &remove_edges,
 733                                               DLTHE_FLAG_UPDATE_FREQ
 734                                               | (opt_info && exit_mod > 0
 735                                                  ? DLTHE_RECORD_COPY_NUMBER
 736                                                    : 0));
 737           gcc_assert (ok);
 738
 739           if (opt_info && exit_mod > 0)
 740             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 741
 742           desc->niter -= exit_mod + 1;
 743           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 744           if (loop->any_estimate
 745               && double_int::from_uhwi (exit_mod + 1).ule
 746                    (loop->nb_iterations_estimate))
 747             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 748           else
 749             loop->any_estimate = false;
 750           desc->noloop_assumptions = NULL_RTX;
 751
 752           SET_BIT (wont_exit, 0);
 753           SET_BIT (wont_exit, 1);
 754         }
 755
 756       RESET_BIT (wont_exit, max_unroll);
 757     }
 758
 759   /* Now unroll the loop.  */
 760
 761   opt_info_start_duplication (opt_info);
 762   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 763                                       max_unroll,
 764                                       wont_exit, desc->out_edge,
 765                                       &remove_edges,
 766                                       DLTHE_FLAG_UPDATE_FREQ
 767                                       | (opt_info
 768                                          ? DLTHE_RECORD_COPY_NUMBER
 769                                            : 0));
 770   gcc_assert (ok);
 771
 772   if (opt_info)
 773     {
 774       apply_opt_in_copies (opt_info, max_unroll, true, true);
 775       free_opt_info (opt_info);
 776     }
 777
 778   free (wont_exit);
 779
 780   if (exit_at_end)
 781     {
 782       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 783       /* Find a new in and out edge; they are in the last copy we have made.  */
 784
 785       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 786         {
 787           desc->out_edge = EDGE_SUCC (exit_block, 0);
 788           desc->in_edge = EDGE_SUCC (exit_block, 1);
 789         }
 790       else
 791         {
 792           desc->out_edge = EDGE_SUCC (exit_block, 1);
 793           desc->in_edge = EDGE_SUCC (exit_block, 0);
 794         }
 795     }
 796
 797   desc->niter /= max_unroll + 1;
 798   loop->nb_iterations_upper_bound
 799     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 800                                                                    + 1),
 801                                             TRUNC_DIV_EXPR);
 802   if (loop->any_estimate)
 803     loop->nb_iterations_estimate
 804       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 805                                                                   + 1),
 806                                            TRUNC_DIV_EXPR);
 807   desc->niter_expr = GEN_INT (desc->niter);
 808
 809   /* Remove the edges.  */
 810   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
 811     remove_path (e);
 812   VEC_free (edge, heap, remove_edges);
 813
 814   if (dump_file)
 815     fprintf (dump_file,
 816              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 817              max_unroll, num_loop_insns (loop));
 818 }
 819
 820 /* Decide whether to unroll LOOP iterating runtime computable number of times
 821    and how much.  */
 822 static void
 823 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 824 {
 825   unsigned nunroll, nunroll_by_av, i;
 826   struct niter_desc *desc;
 827   double_int iterations;
 828
 829   if (!(flags & UAP_UNROLL))
 830     {
 831       /* We were not asked to, just return back silently.  */
 832       return;
 833     }
 834
 835   if (dump_file)
 836     fprintf (dump_file,
 837              "\n;; Considering unrolling loop with runtime "
 838              "computable number of iterations\n");
 839
 840   /* nunroll = total number of copies of the original loop body in
 841      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 842   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 843   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 844   if (nunroll > nunroll_by_av)
 845     nunroll = nunroll_by_av;
 846   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 847     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 848
 849   if (targetm.loop_unroll_adjust)
 850     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 851
 852   /* Skip big loops.  */
 853   if (nunroll <= 1)
 854     {
 855       if (dump_file)
 856         fprintf (dump_file, ";; Not considering loop, is too big\n");
 857       return;
 858     }
 859
 860   /* Check for simple loops.  */
 861   desc = get_simple_loop_desc (loop);
 862
 863   /* Check simpleness.  */
 864   if (!desc->simple_p || desc->assumptions)
 865     {
 866       if (dump_file)
 867         fprintf (dump_file,
 868                  ";; Unable to prove that the number of iterations "
 869                  "can be counted in runtime\n");
 870       return;
 871     }
 872
 873   if (desc->const_iter)
 874     {
 875       if (dump_file)
 876         fprintf (dump_file, ";; Loop iterates constant times\n");
 877       return;
 878     }
 879
 880   /* Check whether the loop rolls.  */
 881   if ((estimated_loop_iterations (loop, &iterations)
 882        || max_loop_iterations (loop, &iterations))
 883       && iterations.ult (double_int::from_shwi (2 * nunroll)))
 884     {
 885       if (dump_file)
 886         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 887       return;
 888     }
 889
 890   /* Success; now force nunroll to be power of 2, as we are unable to
 891      cope with overflows in computation of number of iterations.  */
 892   for (i = 1; 2 * i <= nunroll; i *= 2)
 893     continue;
 894
 895   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 896   loop->lpt_decision.times = i - 1;
 897
 898   if (dump_file)
 899     fprintf (dump_file,
 900              ";; Decided to unroll the runtime computable "
 901              "times rolling loop, %d times.\n",
 902              loop->lpt_decision.times);
 903 }
 904
 905 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 906    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 907    and NULL is returned instead.  */
 908
 909 basic_block
 910 split_edge_and_insert (edge e, rtx insns)
 911 {
 912   basic_block bb;
 913
 914   if (!insns)
 915     return NULL;
 916   bb = split_edge (e);
 917   emit_insn_after (insns, BB_END (bb));
 918
 919   /* ??? We used to assume that INSNS can contain control flow insns, and
 920      that we had to try to find sub basic blocks in BB to maintain a valid
 921      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 922      and call break_superblocks when going out of cfglayout mode.  But it
 923      turns out that this never happens; and that if it does ever happen,
 924      the TODO_verify_flow at the end of the RTL loop passes would fail.
 925
 926      There are two reasons why we expected we could have control flow insns
 927      in INSNS.  The first is when a comparison has to be done in parts, and
 928      the second is when the number of iterations is computed for loops with
 929      the number of iterations known at runtime.  In both cases, test cases
 930      to get control flow in INSNS appear to be impossible to construct:
 931
 932       * If do_compare_rtx_and_jump needs several branches to do comparison
 933         in a mode that needs comparison by parts, we cannot analyze the
 934         number of iterations of the loop, and we never get to unrolling it.
 935
 936       * The code in expand_divmod that was suspected to cause creation of
 937         branching code seems to be only accessed for signed division.  The
 938         divisions used by # of iterations analysis are always unsigned.
 939         Problems might arise on architectures that emits branching code
 940         for some operations that may appear in the unroller (especially
 941         for division), but we have no such architectures.
 942
 943      Considering all this, it was decided that we should for now assume
 944      that INSNS can in theory contain control flow insns, but in practice
 945      it never does.  So we don't handle the theoretical case, and should
 946      a real failure ever show up, we have a pretty good clue for how to
 947      fix it.  */
 948
 949   return bb;
 950 }
 951
 952 /* Unroll LOOP for that we are able to count number of iterations in runtime
 953    LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this (with some
 954    extra care for case n < 0):
 955
 956    for (i = 0; i < n; i++)
 957      body;
 958
 959    ==>
 960
 961    i = 0;
 962    mod = n % 4;
 963
 964    switch (mod)
 965      {
 966        case 3:
 967          body; i++;
 968        case 2:
 969          body; i++;
 970        case 1:
 971          body; i++;
 972        case 0: ;
 973      }
 974
 975    while (i < n)
 976      {
 977        body; i++;
 978        body; i++;
 979        body; i++;
 980        body; i++;
 981      }
 982    */
 983 static void
 984 unroll_loop_runtime_iterations (struct loop *loop)
 985 {
 986   rtx old_niter, niter, init_code, branch_code, tmp;
 987   unsigned i, j, p;
 988   basic_block preheader, *body, swtch, ezc_swtch;
 989   VEC (basic_block, heap) *dom_bbs;
 990   sbitmap wont_exit;
 991   int may_exit_copy;
 992   unsigned n_peel;
 993   VEC (edge, heap) *remove_edges;
 994   edge e;
 995   bool extra_zero_check, last_may_exit;
 996   unsigned max_unroll = loop->lpt_decision.times;
 997   struct niter_desc *desc = get_simple_loop_desc (loop);
 998   bool exit_at_end = loop_exit_at_end_p (loop);
 999   struct opt_info *opt_info = NULL;
1000   bool ok;
1001
1002   if (flag_split_ivs_in_unroller
1003       || flag_variable_expansion_in_unroller)
1004     opt_info = analyze_insns_in_loop (loop);
1005
1006   /* Remember blocks whose dominators will have to be updated.  */
1007   dom_bbs = NULL;
1008
1009   body = get_loop_body (loop);
1010   for (i = 0; i < loop->num_nodes; i++)
1011     {
1012       VEC (basic_block, heap) *ldom;
1013       basic_block bb;
1014
1015       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1016       FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
1017         if (!flow_bb_inside_loop_p (loop, bb))
1018           VEC_safe_push (basic_block, heap, dom_bbs, bb);
1019
1020       VEC_free (basic_block, heap, ldom);
1021     }
1022   free (body);
1023
1024   if (!exit_at_end)
1025     {
1026       /* Leave exit in first copy (for explanation why see comment in
1027          unroll_loop_constant_iterations).  */
1028       may_exit_copy = 0;
1029       n_peel = max_unroll - 1;
1030       extra_zero_check = true;
1031       last_may_exit = false;
1032     }
1033   else
1034     {
1035       /* Leave exit in last copy (for explanation why see comment in
1036          unroll_loop_constant_iterations).  */
1037       may_exit_copy = max_unroll;
1038       n_peel = max_unroll;
1039       extra_zero_check = false;
1040       last_may_exit = true;
1041     }
1042
1043   /* Get expression for number of iterations.  */
1044   start_sequence ();
1045   old_niter = niter = gen_reg_rtx (desc->mode);
1046   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1047   if (tmp != niter)
1048     emit_move_insn (niter, tmp);
1049
1050   /* Count modulo by ANDing it with max_unroll; we use the fact that
1051      the number of unrollings is a power of two, and thus this is correct
1052      even if there is overflow in the computation.  */
1053   niter = expand_simple_binop (desc->mode, AND,
1054                                niter,
1055                                GEN_INT (max_unroll),
1056                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1057
1058   init_code = get_insns ();
1059   end_sequence ();
1060   unshare_all_rtl_in_chain (init_code);
1061
1062   /* Precondition the loop.  */
1063   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1064
1065   remove_edges = NULL;
1066
1067   wont_exit = sbitmap_alloc (max_unroll + 2);
1068
1069   /* Peel the first copy of loop body (almost always we must leave exit test
1070      here; the only exception is when we have extra zero check and the number
1071      of iterations is reliable.  Also record the place of (possible) extra
1072      zero check.  */
1073   sbitmap_zero (wont_exit);
1074   if (extra_zero_check
1075       && !desc->noloop_assumptions)
1076     SET_BIT (wont_exit, 1);
1077   ezc_swtch = loop_preheader_edge (loop)->src;
1078   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1079                                       1, wont_exit, desc->out_edge,
1080                                       &remove_edges,
1081                                       DLTHE_FLAG_UPDATE_FREQ);
1082   gcc_assert (ok);
1083
1084   /* Record the place where switch will be built for preconditioning.  */
1085   swtch = split_edge (loop_preheader_edge (loop));
1086
1087   for (i = 0; i < n_peel; i++)
1088     {
1089       /* Peel the copy.  */
1090       sbitmap_zero (wont_exit);
1091       if (i != n_peel - 1 || !last_may_exit)
1092         SET_BIT (wont_exit, 1);
1093       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1094                                           1, wont_exit, desc->out_edge,
1095                                           &remove_edges,
1096                                           DLTHE_FLAG_UPDATE_FREQ);
1097       gcc_assert (ok);
1098
1099       /* Create item for switch.  */
1100       j = n_peel - i - (extra_zero_check ? 0 : 1);
1101       p = REG_BR_PROB_BASE / (i + 2);
1102
1103       preheader = split_edge (loop_preheader_edge (loop));
1104       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1105                                           block_label (preheader), p,
1106                                           NULL_RTX);
1107
1108       /* We rely on the fact that the compare and jump cannot be optimized out,
1109          and hence the cfg we create is correct.  */
1110       gcc_assert (branch_code != NULL_RTX);
1111
1112       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1113       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1114       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1115       e = make_edge (swtch, preheader,
1116                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1117       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1118       e->probability = p;
1119     }
1120
1121   if (extra_zero_check)
1122     {
1123       /* Add branch for zero iterations.  */
1124       p = REG_BR_PROB_BASE / (max_unroll + 1);
1125       swtch = ezc_swtch;
1126       preheader = split_edge (loop_preheader_edge (loop));
1127       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1128                                           block_label (preheader), p,
1129                                           NULL_RTX);
1130       gcc_assert (branch_code != NULL_RTX);
1131
1132       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1133       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1134       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1135       e = make_edge (swtch, preheader,
1136                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1137       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1138       e->probability = p;
1139     }
1140
1141   /* Recount dominators for outer blocks.  */
1142   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1143
1144   /* And unroll loop.  */
1145
1146   sbitmap_ones (wont_exit);
1147   RESET_BIT (wont_exit, may_exit_copy);
1148   opt_info_start_duplication (opt_info);
1149
1150   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1151                                       max_unroll,
1152                                       wont_exit, desc->out_edge,
1153                                       &remove_edges,
1154                                       DLTHE_FLAG_UPDATE_FREQ
1155                                       | (opt_info
1156                                          ? DLTHE_RECORD_COPY_NUMBER
1157                                            : 0));
1158   gcc_assert (ok);
1159
1160   if (opt_info)
1161     {
1162       apply_opt_in_copies (opt_info, max_unroll, true, true);
1163       free_opt_info (opt_info);
1164     }
1165
1166   free (wont_exit);
1167
1168   if (exit_at_end)
1169     {
1170       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1171       /* Find a new in and out edge; they are in the last copy we have
1172          made.  */
1173
1174       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1175         {
1176           desc->out_edge = EDGE_SUCC (exit_block, 0);
1177           desc->in_edge = EDGE_SUCC (exit_block, 1);
1178         }
1179       else
1180         {
1181           desc->out_edge = EDGE_SUCC (exit_block, 1);
1182           desc->in_edge = EDGE_SUCC (exit_block, 0);
1183         }
1184     }
1185
1186   /* Remove the edges.  */
1187   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
1188     remove_path (e);
1189   VEC_free (edge, heap, remove_edges);
1190
1191   /* We must be careful when updating the number of iterations due to
1192      preconditioning and the fact that the value must be valid at entry
1193      of the loop.  After passing through the above code, we see that
1194      the correct new number of iterations is this:  */
1195   gcc_assert (!desc->const_iter);
1196   desc->niter_expr =
1197     simplify_gen_binary (UDIV, desc->mode, old_niter,
1198                          GEN_INT (max_unroll + 1));
1199   loop->nb_iterations_upper_bound
1200     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1201                                                                    + 1),
1202                                             TRUNC_DIV_EXPR);
1203   if (loop->any_estimate)
1204     loop->nb_iterations_estimate
1205       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1206                                                                   + 1),
1207                                            TRUNC_DIV_EXPR);
1208   if (exit_at_end)
1209     {
1210       desc->niter_expr =
1211         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1212       desc->noloop_assumptions = NULL_RTX;
1213       --loop->nb_iterations_upper_bound;
1214       if (loop->any_estimate
1215           && loop->nb_iterations_estimate != double_int_zero)
1216         --loop->nb_iterations_estimate;
1217       else
1218         loop->any_estimate = false;
1219     }
1220
1221   if (dump_file)
1222     fprintf (dump_file,
1223              ";; Unrolled loop %d times, counting # of iterations "
1224              "in runtime, %i insns\n",
1225              max_unroll, num_loop_insns (loop));
1226
1227   VEC_free (basic_block, heap, dom_bbs);
1228 }
1229
1230 /* Decide whether to simply peel LOOP and how much.  */
1231 static void
1232 decide_peel_simple (struct loop *loop, int flags)
1233 {
1234   unsigned npeel;
1235   struct niter_desc *desc;
1236   double_int iterations;
1237
1238   if (!(flags & UAP_PEEL))
1239     {
1240       /* We were not asked to, just return back silently.  */
1241       return;
1242     }
1243
1244   if (dump_file)
1245     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1246
1247   /* npeel = number of iterations to peel.  */
1248   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1249   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1250     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1251
1252   /* Skip big loops.  */
1253   if (!npeel)
1254     {
1255       if (dump_file)
1256         fprintf (dump_file, ";; Not considering loop, is too big\n");
1257       return;
1258     }
1259
1260   /* Check for simple loops.  */
1261   desc = get_simple_loop_desc (loop);
1262
1263   /* Check number of iterations.  */
1264   if (desc->simple_p && !desc->assumptions && desc->const_iter)
1265     {
1266       if (dump_file)
1267         fprintf (dump_file, ";; Loop iterates constant times\n");
1268       return;
1269     }
1270
1271   /* Do not simply peel loops with branches inside -- it increases number
1272      of mispredicts.  */
1273   if (num_loop_branches (loop) > 1)
1274     {
1275       if (dump_file)
1276         fprintf (dump_file, ";; Not peeling, contains branches\n");
1277       return;
1278     }
1279
1280   /* If we have realistic estimate on number of iterations, use it.  */
1281   if (estimated_loop_iterations (loop, &iterations))
1282     {
1283       if (double_int::from_shwi (npeel).ule (iterations))
1284         {
1285           if (dump_file)
1286             {
1287               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1288               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1289                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1290               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1291                        npeel);
1292             }
1293           return;
1294         }
1295       npeel = iterations.to_shwi () + 1;
1296     }
1297   /* If we have small enough bound on iterations, we can still peel (completely
1298      unroll).  */
1299   else if (max_loop_iterations (loop, &iterations)
1300            && iterations.ult (double_int::from_shwi (npeel)))
1301     npeel = iterations.to_shwi () + 1;
1302   else
1303     {
1304       /* For now we have no good heuristics to decide whether loop peeling
1305          will be effective, so disable it.  */
1306       if (dump_file)
1307         fprintf (dump_file,
1308                  ";; Not peeling loop, no evidence it will be profitable\n");
1309       return;
1310     }
1311
1312   /* Success.  */
1313   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1314   loop->lpt_decision.times = npeel;
1315
1316   if (dump_file)
1317     fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1318              loop->lpt_decision.times);
1319 }
1320
1321 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1322    while (cond)
1323      body;
1324
1325    ==>
1326
1327    if (!cond) goto end;
1328    body;
1329    if (!cond) goto end;
1330    body;
1331    while (cond)
1332      body;
1333    end: ;
1334    */
1335 static void
1336 peel_loop_simple (struct loop *loop)
1337 {
1338   sbitmap wont_exit;
1339   unsigned npeel = loop->lpt_decision.times;
1340   struct niter_desc *desc = get_simple_loop_desc (loop);
1341   struct opt_info *opt_info = NULL;
1342   bool ok;
1343
1344   if (flag_split_ivs_in_unroller && npeel > 1)
1345     opt_info = analyze_insns_in_loop (loop);
1346
1347   wont_exit = sbitmap_alloc (npeel + 1);
1348   sbitmap_zero (wont_exit);
1349
1350   opt_info_start_duplication (opt_info);
1351
1352   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1353                                       npeel, wont_exit, NULL,
1354                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1355                                       | (opt_info
1356                                          ? DLTHE_RECORD_COPY_NUMBER
1357                                            : 0));
1358   gcc_assert (ok);
1359
1360   free (wont_exit);
1361
1362   if (opt_info)
1363     {
1364       apply_opt_in_copies (opt_info, npeel, false, false);
1365       free_opt_info (opt_info);
1366     }
1367
1368   if (desc->simple_p)
1369     {
1370       if (desc->const_iter)
1371         {
1372           desc->niter -= npeel;
1373           desc->niter_expr = GEN_INT (desc->niter);
1374           desc->noloop_assumptions = NULL_RTX;
1375         }
1376       else
1377         {
1378           /* We cannot just update niter_expr, as its value might be clobbered
1379              inside loop.  We could handle this by counting the number into
1380              temporary just like we do in runtime unrolling, but it does not
1381              seem worthwhile.  */
1382           free_simple_loop_desc (loop);
1383         }
1384     }
1385   if (dump_file)
1386     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1387 }
1388
1389 /* Decide whether to unroll LOOP stupidly and how much.  */
1390 static void
1391 decide_unroll_stupid (struct loop *loop, int flags)
1392 {
1393   unsigned nunroll, nunroll_by_av, i;
1394   struct niter_desc *desc;
1395   double_int iterations;
1396
1397   if (!(flags & UAP_UNROLL_ALL))
1398     {
1399       /* We were not asked to, just return back silently.  */
1400       return;
1401     }
1402
1403   if (dump_file)
1404     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1405
1406   /* nunroll = total number of copies of the original loop body in
1407      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1408   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1409   nunroll_by_av
1410     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1411   if (nunroll > nunroll_by_av)
1412     nunroll = nunroll_by_av;
1413   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1414     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1415
1416   if (targetm.loop_unroll_adjust)
1417     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1418
1419   /* Skip big loops.  */
1420   if (nunroll <= 1)
1421     {
1422       if (dump_file)
1423         fprintf (dump_file, ";; Not considering loop, is too big\n");
1424       return;
1425     }
1426
1427   /* Check for simple loops.  */
1428   desc = get_simple_loop_desc (loop);
1429
1430   /* Check simpleness.  */
1431   if (desc->simple_p && !desc->assumptions)
1432     {
1433       if (dump_file)
1434         fprintf (dump_file, ";; The loop is simple\n");
1435       return;
1436     }
1437
1438   /* Do not unroll loops with branches inside -- it increases number
1439      of mispredicts.  */
1440   if (num_loop_branches (loop) > 1)
1441     {
1442       if (dump_file)
1443         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1444       return;
1445     }
1446
1447   /* Check whether the loop rolls.  */
1448   if ((estimated_loop_iterations (loop, &iterations)
1449        || max_loop_iterations (loop, &iterations))
1450       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1451     {
1452       if (dump_file)
1453         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1454       return;
1455     }
1456
1457   /* Success.  Now force nunroll to be power of 2, as it seems that this
1458      improves results (partially because of better alignments, partially
1459      because of some dark magic).  */
1460   for (i = 1; 2 * i <= nunroll; i *= 2)
1461     continue;
1462
1463   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1464   loop->lpt_decision.times = i - 1;
1465
1466   if (dump_file)
1467     fprintf (dump_file,
1468              ";; Decided to unroll the loop stupidly, %d times.\n",
1469              loop->lpt_decision.times);
1470 }
1471
1472 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1473    while (cond)
1474      body;
1475
1476    ==>
1477
1478    while (cond)
1479      {
1480        body;
1481        if (!cond) break;
1482        body;
1483        if (!cond) break;
1484        body;
1485        if (!cond) break;
1486        body;
1487      }
1488    */
1489 static void
1490 unroll_loop_stupid (struct loop *loop)
1491 {
1492   sbitmap wont_exit;
1493   unsigned nunroll = loop->lpt_decision.times;
1494   struct niter_desc *desc = get_simple_loop_desc (loop);
1495   struct opt_info *opt_info = NULL;
1496   bool ok;
1497
1498   if (flag_split_ivs_in_unroller
1499       || flag_variable_expansion_in_unroller)
1500     opt_info = analyze_insns_in_loop (loop);
1501
1502
1503   wont_exit = sbitmap_alloc (nunroll + 1);
1504   sbitmap_zero (wont_exit);
1505   opt_info_start_duplication (opt_info);
1506
1507   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1508                                       nunroll, wont_exit,
1509                                       NULL, NULL,
1510                                       DLTHE_FLAG_UPDATE_FREQ
1511                                       | (opt_info
1512                                          ? DLTHE_RECORD_COPY_NUMBER
1513                                            : 0));
1514   gcc_assert (ok);
1515
1516   if (opt_info)
1517     {
1518       apply_opt_in_copies (opt_info, nunroll, true, true);
1519       free_opt_info (opt_info);
1520     }
1521
1522   free (wont_exit);
1523
1524   if (desc->simple_p)
1525     {
1526       /* We indeed may get here provided that there are nontrivial assumptions
1527          for a loop to be really simple.  We could update the counts, but the
1528          problem is that we are unable to decide which exit will be taken
1529          (not really true in case the number of iterations is constant,
1530          but noone will do anything with this information, so we do not
1531          worry about it).  */
1532       desc->simple_p = false;
1533     }
1534
1535   if (dump_file)
1536     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1537              nunroll, num_loop_insns (loop));
1538 }
1539
1540 /* A hash function for information about insns to split.  */
1541
1542 static hashval_t
1543 si_info_hash (const void *ivts)
1544 {
1545   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1546 }
1547
1548 /* An equality functions for information about insns to split.  */
1549
1550 static int
1551 si_info_eq (const void *ivts1, const void *ivts2)
1552 {
1553   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1554   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1555
1556   return i1->insn == i2->insn;
1557 }
1558
1559 /* Return a hash for VES, which is really a "var_to_expand *".  */
1560
1561 static hashval_t
1562 ve_info_hash (const void *ves)
1563 {
1564   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1565 }
1566
1567 /* Return true if IVTS1 and IVTS2 (which are really both of type
1568    "var_to_expand *") refer to the same instruction.  */
1569
1570 static int
1571 ve_info_eq (const void *ivts1, const void *ivts2)
1572 {
1573   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1574   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1575
1576   return i1->insn == i2->insn;
1577 }
1578
1579 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1580    Set *DEBUG_USES to the number of debug insns that reference the
1581    variable.  */
1582
1583 bool
1584 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1585                                   int *debug_uses)
1586 {
1587   basic_block *body, bb;
1588   unsigned i;
1589   int count_ref = 0;
1590   rtx insn;
1591
1592   body = get_loop_body (loop);
1593   for (i = 0; i < loop->num_nodes; i++)
1594     {
1595       bb = body[i];
1596
1597       FOR_BB_INSNS (bb, insn)
1598         if (!rtx_referenced_p (reg, insn))
1599           continue;
1600         else if (DEBUG_INSN_P (insn))
1601           ++*debug_uses;
1602         else if (++count_ref > 1)
1603           break;
1604     }
1605   free (body);
1606   return (count_ref  == 1);
1607 }
1608
1609 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1610
1611 static void
1612 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1613 {
1614   basic_block *body, bb;
1615   unsigned i;
1616   rtx insn;
1617
1618   body = get_loop_body (loop);
1619   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1620     {
1621       bb = body[i];
1622
1623       FOR_BB_INSNS (bb, insn)
1624         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1625           continue;
1626         else
1627           {
1628             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1629                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1630             if (!--debug_uses)
1631               break;
1632           }
1633     }
1634   free (body);
1635 }
1636
1637 /* Determine whether INSN contains an accumulator
1638    which can be expanded into separate copies,
1639    one for each copy of the LOOP body.
1640
1641    for (i = 0 ; i < n; i++)
1642      sum += a[i];
1643
1644    ==>
1645
1646    sum += a[i]
1647    ....
1648    i = i+1;
1649    sum1 += a[i]
1650    ....
1651    i = i+1
1652    sum2 += a[i];
1653    ....
1654
1655    Return NULL if INSN contains no opportunity for expansion of accumulator.
1656    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1657    information and return a pointer to it.
1658 */
1659
1660 static struct var_to_expand *
1661 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1662 {
1663   rtx set, dest, src;
1664   struct var_to_expand *ves;
1665   unsigned accum_pos;
1666   enum rtx_code code;
1667   int debug_uses = 0;
1668
1669   set = single_set (insn);
1670   if (!set)
1671     return NULL;
1672
1673   dest = SET_DEST (set);
1674   src = SET_SRC (set);
1675   code = GET_CODE (src);
1676
1677   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1678     return NULL;
1679
1680   if (FLOAT_MODE_P (GET_MODE (dest)))
1681     {
1682       if (!flag_associative_math)
1683         return NULL;
1684       /* In the case of FMA, we're also changing the rounding.  */
1685       if (code == FMA && !flag_unsafe_math_optimizations)
1686         return NULL;
1687     }
1688
1689   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1690      in MD.  But if there is no optab to generate the insn, we can not
1691      perform the variable expansion.  This can happen if an MD provides
1692      an insn but not a named pattern to generate it, for example to avoid
1693      producing code that needs additional mode switches like for x87/mmx.
1694
1695      So we check have_insn_for which looks for an optab for the operation
1696      in SRC.  If it doesn't exist, we can't perform the expansion even
1697      though INSN is valid.  */
1698   if (!have_insn_for (code, GET_MODE (src)))
1699     return NULL;
1700
1701   if (!REG_P (dest)
1702       && !(GET_CODE (dest) == SUBREG
1703            && REG_P (SUBREG_REG (dest))))
1704     return NULL;
1705
1706   /* Find the accumulator use within the operation.  */
1707   if (code == FMA)
1708     {
1709       /* We only support accumulation via FMA in the ADD position.  */
1710       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1711         return NULL;
1712       accum_pos = 2;
1713     }
1714   else if (rtx_equal_p (dest, XEXP (src, 0)))
1715     accum_pos = 0;
1716   else if (rtx_equal_p (dest, XEXP (src, 1)))
1717     {
1718       /* The method of expansion that we are using; which includes the
1719          initialization of the expansions with zero and the summation of
1720          the expansions at the end of the computation will yield wrong
1721          results for (x = something - x) thus avoid using it in that case.  */
1722       if (code == MINUS)
1723         return NULL;
1724       accum_pos = 1;
1725     }
1726   else
1727     return NULL;
1728
1729   /* It must not otherwise be used.  */
1730   if (code == FMA)
1731     {
1732       if (rtx_referenced_p (dest, XEXP (src, 0))
1733           || rtx_referenced_p (dest, XEXP (src, 1)))
1734         return NULL;
1735     }
1736   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1737     return NULL;
1738
1739   /* It must be used in exactly one insn.  */
1740   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1741     return NULL;
1742
1743   if (dump_file)
1744     {
1745       fprintf (dump_file, "\n;; Expanding Accumulator ");
1746       print_rtl (dump_file, dest);
1747       fprintf (dump_file, "\n");
1748     }
1749
1750   if (debug_uses)
1751     /* Instead of resetting the debug insns, we could replace each
1752        debug use in the loop with the sum or product of all expanded
1753        accummulators.  Since we'll only know of all expansions at the
1754        end, we'd have to keep track of which vars_to_expand a debug
1755        insn in the loop references, take note of each copy of the
1756        debug insn during unrolling, and when it's all done, compute
1757        the sum or product of each variable and adjust the original
1758        debug insn and each copy thereof.  What a pain!  */
1759     reset_debug_uses_in_loop (loop, dest, debug_uses);
1760
1761   /* Record the accumulator to expand.  */
1762   ves = XNEW (struct var_to_expand);
1763   ves->insn = insn;
1764   ves->reg = copy_rtx (dest);
1765   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1766   ves->next = NULL;
1767   ves->op = GET_CODE (src);
1768   ves->expansion_count = 0;
1769   ves->reuse_expansion = 0;
1770   ves->accum_pos = accum_pos;
1771   return ves;
1772 }
1773
1774 /* Determine whether there is an induction variable in INSN that
1775    we would like to split during unrolling.
1776
1777    I.e. replace
1778
1779    i = i + 1;
1780    ...
1781    i = i + 1;
1782    ...
1783    i = i + 1;
1784    ...
1785
1786    type chains by
1787
1788    i0 = i + 1
1789    ...
1790    i = i0 + 1
1791    ...
1792    i = i0 + 2
1793    ...
1794
1795    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1796    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1797    pointer to it.  */
1798
1799 static struct iv_to_split *
1800 analyze_iv_to_split_insn (rtx insn)
1801 {
1802   rtx set, dest;
1803   struct rtx_iv iv;
1804   struct iv_to_split *ivts;
1805   bool ok;
1806
1807   /* For now we just split the basic induction variables.  Later this may be
1808      extended for example by selecting also addresses of memory references.  */
1809   set = single_set (insn);
1810   if (!set)
1811     return NULL;
1812
1813   dest = SET_DEST (set);
1814   if (!REG_P (dest))
1815     return NULL;
1816
1817   if (!biv_p (insn, dest))
1818     return NULL;
1819
1820   ok = iv_analyze_result (insn, dest, &iv);
1821
1822   /* This used to be an assert under the assumption that if biv_p returns
1823      true that iv_analyze_result must also return true.  However, that
1824      assumption is not strictly correct as evidenced by pr25569.
1825
1826      Returning NULL when iv_analyze_result returns false is safe and
1827      avoids the problems in pr25569 until the iv_analyze_* routines
1828      can be fixed, which is apparently hard and time consuming
1829      according to their author.  */
1830   if (! ok)
1831     return NULL;
1832
1833   if (iv.step == const0_rtx
1834       || iv.mode != iv.extend_mode)
1835     return NULL;
1836
1837   /* Record the insn to split.  */
1838   ivts = XNEW (struct iv_to_split);
1839   ivts->insn = insn;
1840   ivts->base_var = NULL_RTX;
1841   ivts->step = iv.step;
1842   ivts->next = NULL;
1843   ivts->n_loc = 1;
1844   ivts->loc[0] = 1;
1845
1846   return ivts;
1847 }
1848
1849 /* Determines which of insns in LOOP can be optimized.
1850    Return a OPT_INFO struct with the relevant hash tables filled
1851    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1852    is undefined for the return value.  */
1853
1854 static struct opt_info *
1855 analyze_insns_in_loop (struct loop *loop)
1856 {
1857   basic_block *body, bb;
1858   unsigned i;
1859   struct opt_info *opt_info = XCNEW (struct opt_info);
1860   rtx insn;
1861   struct iv_to_split *ivts = NULL;
1862   struct var_to_expand *ves = NULL;
1863   PTR *slot1;
1864   PTR *slot2;
1865   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1866   edge exit;
1867   bool can_apply = false;
1868
1869   iv_analysis_loop_init (loop);
1870
1871   body = get_loop_body (loop);
1872
1873   if (flag_split_ivs_in_unroller)
1874     {
1875       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1876                                               si_info_hash, si_info_eq, free);
1877       opt_info->iv_to_split_head = NULL;
1878       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1879     }
1880
1881   /* Record the loop exit bb and loop preheader before the unrolling.  */
1882   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1883
1884   if (VEC_length (edge, edges) == 1)
1885     {
1886       exit = VEC_index (edge, edges, 0);
1887       if (!(exit->flags & EDGE_COMPLEX))
1888         {
1889           opt_info->loop_exit = split_edge (exit);
1890           can_apply = true;
1891         }
1892     }
1893
1894   if (flag_variable_expansion_in_unroller
1895       && can_apply)
1896     {
1897       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1898                                                         ve_info_hash,
1899                                                         ve_info_eq, free);
1900       opt_info->var_to_expand_head = NULL;
1901       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1902     }
1903
1904   for (i = 0; i < loop->num_nodes; i++)
1905     {
1906       bb = body[i];
1907       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1908         continue;
1909
1910       FOR_BB_INSNS (bb, insn)
1911       {
1912         if (!INSN_P (insn))
1913           continue;
1914
1915         if (opt_info->insns_to_split)
1916           ivts = analyze_iv_to_split_insn (insn);
1917
1918         if (ivts)
1919           {
1920             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1921             gcc_assert (*slot1 == NULL);
1922             *slot1 = ivts;
1923             *opt_info->iv_to_split_tail = ivts;
1924             opt_info->iv_to_split_tail = &ivts->next;
1925             continue;
1926           }
1927
1928         if (opt_info->insns_with_var_to_expand)
1929           ves = analyze_insn_to_expand_var (loop, insn);
1930
1931         if (ves)
1932           {
1933             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1934             gcc_assert (*slot2 == NULL);
1935             *slot2 = ves;
1936             *opt_info->var_to_expand_tail = ves;
1937             opt_info->var_to_expand_tail = &ves->next;
1938           }
1939       }
1940     }
1941
1942   VEC_free (edge, heap, edges);
1943   free (body);
1944   return opt_info;
1945 }
1946
1947 /* Called just before loop duplication.  Records start of duplicated area
1948    to OPT_INFO.  */
1949
1950 static void
1951 opt_info_start_duplication (struct opt_info *opt_info)
1952 {
1953   if (opt_info)
1954     opt_info->first_new_block = last_basic_block;
1955 }
1956
/* Return how many iterations separate the initialization of the base
   variable from the copy numbered N_COPY.  N_COPIES is the total number
   of newly created copies, and UNROLLING is true when the loop is being
   unrolled rather than peeled.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* When unrolling, the base variable is initialized in the original
     loop body (copy number 0), so the distance is the copy number.  */
  if (unrolling)
    return n_copy;

  /* When peeling, the initialization lives in copy number 1 and the
     original loop (number 0) comes last.  */
  return n_copy != 0 ? n_copy - 1 : n_copies;
}
1981
1982 /* Locate in EXPR the expression corresponding to the location recorded
1983    in IVTS, and return a pointer to the RTX for this location.  */
1984
1985 static rtx *
1986 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1987 {
1988   unsigned i;
1989   rtx *ret = &expr;
1990
1991   for (i = 0; i < ivts->n_loc; i++)
1992     ret = &XEXP (*ret, ivts->loc[i]);
1993
1994   return ret;
1995 }
1996
1997 /* Allocate basic variable for the induction variable chain.  */
1998
1999 static void
2000 allocate_basic_variable (struct iv_to_split *ivts)
2001 {
2002   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2003
2004   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2005 }
2006
2007 /* Insert initialization of basic variable of IVTS before INSN, taking
2008    the initial value from INSN.  */
2009
2010 static void
2011 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2012 {
2013   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2014   rtx seq;
2015
2016   start_sequence ();
2017   expr = force_operand (expr, ivts->base_var);
2018   if (expr != ivts->base_var)
2019     emit_move_insn (ivts->base_var, expr);
2020   seq = get_insns ();
2021   end_sequence ();
2022
2023   emit_insn_before (seq, insn);
2024 }
2025
2026 /* Replace the use of induction variable described in IVTS in INSN
2027    by base variable + DELTA * step.  */
2028
2029 static void
2030 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2031 {
2032   rtx expr, *loc, seq, incr, var;
2033   enum machine_mode mode = GET_MODE (ivts->base_var);
2034   rtx src, dest, set;
2035
2036   /* Construct base + DELTA * step.  */
2037   if (!delta)
2038     expr = ivts->base_var;
2039   else
2040     {
2041       incr = simplify_gen_binary (MULT, mode,
2042                                   ivts->step, gen_int_mode (delta, mode));
2043       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2044                                   ivts->base_var, incr);
2045     }
2046
2047   /* Figure out where to do the replacement.  */
2048   loc = get_ivts_expr (single_set (insn), ivts);
2049
2050   /* If we can make the replacement right away, we're done.  */
2051   if (validate_change (insn, loc, expr, 0))
2052     return;
2053
2054   /* Otherwise, force EXPR into a register and try again.  */
2055   start_sequence ();
2056   var = gen_reg_rtx (mode);
2057   expr = force_operand (expr, var);
2058   if (expr != var)
2059     emit_move_insn (var, expr);
2060   seq = get_insns ();
2061   end_sequence ();
2062   emit_insn_before (seq, insn);
2063
2064   if (validate_change (insn, loc, var, 0))
2065     return;
2066
2067   /* The last chance.  Try recreating the assignment in insn
2068      completely from scratch.  */
2069   set = single_set (insn);
2070   gcc_assert (set);
2071
2072   start_sequence ();
2073   *loc = var;
2074   src = copy_rtx (SET_SRC (set));
2075   dest = copy_rtx (SET_DEST (set));
2076   src = force_operand (src, dest);
2077   if (src != dest)
2078     emit_move_insn (dest, src);
2079   seq = get_insns ();
2080   end_sequence ();
2081
2082   emit_insn_before (seq, insn);
2083   delete_insn (insn);
2084 }
2085
2086
2087 /* Return one expansion of the accumulator recorded in struct VE.  */
2088
2089 static rtx
2090 get_expansion (struct var_to_expand *ve)
2091 {
2092   rtx reg;
2093
2094   if (ve->reuse_expansion == 0)
2095     reg = ve->reg;
2096   else
2097     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2098
2099   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2100     ve->reuse_expansion = 0;
2101   else
2102     ve->reuse_expansion++;
2103
2104   return reg;
2105 }
2106
2107
2108 /* Given INSN replace the uses of the accumulator recorded in VE
2109    with a new register.  */
2110
2111 static void
2112 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2113 {
2114   rtx new_reg, set;
2115   bool really_new_expansion = false;
2116
2117   set = single_set (insn);
2118   gcc_assert (set);
2119
2120   /* Generate a new register only if the expansion limit has not been
2121      reached.  Else reuse an already existing expansion.  */
2122   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2123     {
2124       really_new_expansion = true;
2125       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2126     }
2127   else
2128     new_reg = get_expansion (ve);
2129
2130   validate_change (insn, &SET_DEST (set), new_reg, 1);
2131   validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2132
2133   if (apply_change_group ())
2134     if (really_new_expansion)
2135       {
2136         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2137         ve->expansion_count++;
2138       }
2139 }
2140
2141 /* Initialize the variable expansions in loop preheader.  PLACE is the
2142    loop-preheader basic block where the initialization of the
2143    expansions should take place.  The expansions are initialized with
2144    (-0) when the operation is plus or minus to honor sign zero.  This
2145    way we can prevent cases where the sign of the final result is
2146    effected by the sign of the expansion.  Here is an example to
2147    demonstrate this:
2148
2149    for (i = 0 ; i < n; i++)
2150      sum += something;
2151
2152    ==>
2153
2154    sum += something
2155    ....
2156    i = i+1;
2157    sum1 += something
2158    ....
2159    i = i+1
2160    sum2 += something;
2161    ....
2162
2163    When SUM is initialized with -zero and SOMETHING is also -zero; the
2164    final result of sum should be -zero thus the expansions sum1 and sum2
2165    should be initialized with -zero as well (otherwise we will get +zero
2166    as the final result).  */
2167
2168 static void
2169 insert_var_expansion_initialization (struct var_to_expand *ve,
2170                                      basic_block place)
2171 {
2172   rtx seq, var, zero_init, insn;
2173   unsigned i;
2174   enum machine_mode mode = GET_MODE (ve->reg);
2175   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2176
2177   if (VEC_length (rtx, ve->var_expansions) == 0)
2178     return;
2179
2180   start_sequence ();
2181   switch (ve->op)
2182     {
2183     case FMA:
2184       /* Note that we only accumulate FMA via the ADD operand.  */
2185     case PLUS:
2186     case MINUS:
2187       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2188         {
2189           if (honor_signed_zero_p)
2190             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2191           else
2192             zero_init = CONST0_RTX (mode);
2193           emit_move_insn (var, zero_init);
2194         }
2195       break;
2196
2197     case MULT:
2198       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2199         {
2200           zero_init = CONST1_RTX (GET_MODE (var));
2201           emit_move_insn (var, zero_init);
2202         }
2203       break;
2204
2205     default:
2206       gcc_unreachable ();
2207     }
2208
2209   seq = get_insns ();
2210   end_sequence ();
2211
2212   insn = BB_HEAD (place);
2213   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2214     insn = NEXT_INSN (insn);
2215
2216   emit_insn_after (seq, insn);
2217 }
2218
2219 /* Combine the variable expansions at the loop exit.  PLACE is the
2220    loop exit basic block where the summation of the expansions should
2221    take place.  */
2222
2223 static void
2224 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2225 {
2226   rtx sum = ve->reg;
2227   rtx expr, seq, var, insn;
2228   unsigned i;
2229
2230   if (VEC_length (rtx, ve->var_expansions) == 0)
2231     return;
2232
2233   start_sequence ();
2234   switch (ve->op)
2235     {
2236     case FMA:
2237       /* Note that we only accumulate FMA via the ADD operand.  */
2238     case PLUS:
2239     case MINUS:
2240       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2241         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2242       break;
2243
2244     case MULT:
2245       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2246         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2247       break;
2248
2249     default:
2250       gcc_unreachable ();
2251     }
2252
2253   expr = force_operand (sum, ve->reg);
2254   if (expr != ve->reg)
2255     emit_move_insn (ve->reg, expr);
2256   seq = get_insns ();
2257   end_sequence ();
2258
2259   insn = BB_HEAD (place);
2260   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2261     insn = NEXT_INSN (insn);
2262
2263   emit_insn_after (seq, insn);
2264 }
2265
2266 /* Apply loop optimizations in loop copies using the
2267    data which gathered during the unrolling.  Structure
2268    OPT_INFO record that data.
2269
2270    UNROLLING is true if we unrolled (not peeled) the loop.
2271    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2272    the loop (as it should happen in complete unrolling, but not in ordinary
2273    peeling of the loop).  */
2274
2275 static void
2276 apply_opt_in_copies (struct opt_info *opt_info,
2277                      unsigned n_copies, bool unrolling,
2278                      bool rewrite_original_loop)
2279 {
2280   unsigned i, delta;
2281   basic_block bb, orig_bb;
2282   rtx insn, orig_insn, next;
2283   struct iv_to_split ivts_templ, *ivts;
2284   struct var_to_expand ve_templ, *ves;
2285
2286   /* Sanity check -- we need to put initialization in the original loop
2287      body.  */
2288   gcc_assert (!unrolling || rewrite_original_loop);
2289
2290   /* Allocate the basic variables (i0).  */
2291   if (opt_info->insns_to_split)
2292     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2293       allocate_basic_variable (ivts);
2294
2295   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2296     {
2297       bb = BASIC_BLOCK (i);
2298       orig_bb = get_bb_original (bb);
2299
2300       /* bb->aux holds position in copy sequence initialized by
2301          duplicate_loop_to_header_edge.  */
2302       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2303                                         unrolling);
2304       bb->aux = 0;
2305       orig_insn = BB_HEAD (orig_bb);
2306       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2307         {
2308           next = NEXT_INSN (insn);
2309           if (!INSN_P (insn)
2310               || (DEBUG_INSN_P (insn)
2311                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2312             continue;
2313
2314           while (!INSN_P (orig_insn)
2315                  || (DEBUG_INSN_P (orig_insn)
2316                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2317                          == LABEL_DECL)))
2318             orig_insn = NEXT_INSN (orig_insn);
2319
2320           ivts_templ.insn = orig_insn;
2321           ve_templ.insn = orig_insn;
2322
2323           /* Apply splitting iv optimization.  */
2324           if (opt_info->insns_to_split)
2325             {
2326               ivts = (struct iv_to_split *)
2327                 htab_find (opt_info->insns_to_split, &ivts_templ);
2328
2329               if (ivts)
2330                 {
2331                   gcc_assert (GET_CODE (PATTERN (insn))
2332                               == GET_CODE (PATTERN (orig_insn)));
2333
2334                   if (!delta)
2335                     insert_base_initialization (ivts, insn);
2336                   split_iv (ivts, insn, delta);
2337                 }
2338             }
2339           /* Apply variable expansion optimization.  */
2340           if (unrolling && opt_info->insns_with_var_to_expand)
2341             {
2342               ves = (struct var_to_expand *)
2343                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2344               if (ves)
2345                 {
2346                   gcc_assert (GET_CODE (PATTERN (insn))
2347                               == GET_CODE (PATTERN (orig_insn)));
2348                   expand_var_during_unrolling (ves, insn);
2349                 }
2350             }
2351           orig_insn = NEXT_INSN (orig_insn);
2352         }
2353     }
2354
2355   if (!rewrite_original_loop)
2356     return;
2357
2358   /* Initialize the variable expansions in the loop preheader
2359      and take care of combining them at the loop exit.  */
2360   if (opt_info->insns_with_var_to_expand)
2361     {
2362       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2363         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2364       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2365         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2366     }
2367
2368   /* Rewrite also the original loop body.  Find them as originals of the blocks
2369      in the last copied iteration, i.e. those that have
2370      get_bb_copy (get_bb_original (bb)) == bb.  */
2371   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2372     {
2373       bb = BASIC_BLOCK (i);
2374       orig_bb = get_bb_original (bb);
2375       if (get_bb_copy (orig_bb) != bb)
2376         continue;
2377
2378       delta = determine_split_iv_delta (0, n_copies, unrolling);
2379       for (orig_insn = BB_HEAD (orig_bb);
2380            orig_insn != NEXT_INSN (BB_END (bb));
2381            orig_insn = next)
2382         {
2383           next = NEXT_INSN (orig_insn);
2384
2385           if (!INSN_P (orig_insn))
2386             continue;
2387
2388           ivts_templ.insn = orig_insn;
2389           if (opt_info->insns_to_split)
2390             {
2391               ivts = (struct iv_to_split *)
2392                 htab_find (opt_info->insns_to_split, &ivts_templ);
2393               if (ivts)
2394                 {
2395                   if (!delta)
2396                     insert_base_initialization (ivts, orig_insn);
2397                   split_iv (ivts, orig_insn, delta);
2398                   continue;
2399                 }
2400             }
2401
2402         }
2403     }
2404 }
2405
2406 /* Release OPT_INFO.  */
2407
2408 static void
2409 free_opt_info (struct opt_info *opt_info)
2410 {
2411   if (opt_info->insns_to_split)
2412     htab_delete (opt_info->insns_to_split);
2413   if (opt_info->insns_with_var_to_expand)
2414     {
2415       struct var_to_expand *ves;
2416
2417       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2418         VEC_free (rtx, heap, ves->var_expansions);
2419       htab_delete (opt_info->insns_with_var_to_expand);
2420     }
2421   free (opt_info);
2422 }