/* Loop unrolling and peeling.
   Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "hard-reg-set.h"
#include "obstack.h"
#include "basic-block.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "params.h"
#include "output.h"
#include "expr.h"
#include "hashtab.h"
#include "recog.h"

/* This pass performs loop unrolling and peeling.  We only perform these
   optimizations on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from the greater
   sequentiality of the code, from better opportunities for further
   optimization passes, and in some cases from fewer tests of the exit
   condition.  The main problem is code growth, which impacts
   performance negatively due to its effect on caches.

   What we do:

   -- complete peeling of once-rolling loops; this is the exception
      mentioned above, as it cancels the loop completely and
      does not cause code growth
   -- complete peeling of loops that roll a (small) constant number of times.
   -- simple peeling of the first iterations of loops that do not roll much
      (according to profile feedback)
   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of the exit condition tests here, but
      there is the extra expense of calculating the number of iterations
   -- simple unrolling of the remaining loops; this is performed only if
      we are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are a lot of parameters (defined and described in params.def)
   that control how much we unroll/peel.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll the loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several %.  */
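
/* As a concrete illustration (with a purely hypothetical source loop and
   an assumed unroll factor of four), the runtime unrolling below turns

     for (i = 0; i < n; i++)
       a[i] = b[i];

   into a prologue that peels n % 4 iterations, followed by a loop whose
   body contains four copies of the original; see the comment before each
   transformation function for the exact shape of the generated code.  */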

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx insn;             /* The insn in which the induction variable occurs.  */
  rtx base_var;         /* The variable on which the values in the further
                           iterations are based.  */
  rtx step;             /* Step of the induction variable.  */
  struct iv_to_split *next;   /* Next entry in walking order.  */
  unsigned n_loc;
  unsigned loc[3];      /* Location where the definition of the induction
                           variable occurs in the insn.  For example if
                           N_LOC is 2, the expression is located at
                           XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};
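
/* For instance (a hypothetical increment insn), if the single_set of an
   insn is (set (reg i) (plus (reg i) (const_int 1))),
   analyze_iv_to_split_insn below records N_LOC = 1 and loc[0] = 1, so the
   recorded location is XEXP (single_set, 1), i.e. the SET_SRC holding the
   (plus ...) expression that gets rewritten in each copy of the body.  */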

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx insn;             /* The insn in which the variable expansion occurs.  */
  rtx reg;              /* The accumulator which is expanded.  */
  VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;   /* Next entry in walking order.  */
  enum rtx_code op;     /* The type of the accumulation - addition, subtraction
                           or multiplication.  */
  int expansion_count;  /* Count the number of expansions generated so far.  */
  int reuse_expansion;  /* The expansion we intend to reuse to expand
                           the accumulator.  If REUSE_EXPANSION is 0 reuse
                           the original accumulator.  Else use
                           var_expansions[REUSE_EXPANSION - 1].  */
  unsigned accum_pos;   /* The position in which the accumulator is placed in
                           the insn src.  For example in x = x + something
                           accum_pos is 0 while in x = something + x accum_pos
                           is 1.  */
};

/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  htab_t insns_to_split;        /* A hashtable of insns to split.  */
  struct iv_to_split *iv_to_split_head;  /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  htab_t insns_with_var_to_expand;  /* A hashtable of insns with accumulators
                                       to expand.  */
  struct var_to_expand *var_to_expand_head;  /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;     /* The first basic block that was
                                   duplicated.  */
  basic_block loop_exit;        /* The loop exit basic block.  */
  basic_block loop_preheader;   /* The loop preheader basic block.  */
};

static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
                                                 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
                                             basic_block);
static rtx get_expansion (struct var_to_expand *);
/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
void
unroll_and_peel_loops (int flags)
{
  struct loop *loop;
  bool check;
  loop_iterator li;

  /* First perform complete loop peeling (it is almost surely a win,
     and affects parameters for further decision a lot).  */
  peel_loops_completely (flags);

  /* Now decide the rest of the unrolling and peeling.  */
  decide_unrolling_and_peeling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      check = true;
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
        {
        case LPT_PEEL_COMPLETELY:
          /* Already done.  */
          gcc_unreachable ();
        case LPT_PEEL_SIMPLE:
          peel_loop_simple (loop);
          break;
        case LPT_UNROLL_CONSTANT:
          unroll_loop_constant_iterations (loop);
          break;
        case LPT_UNROLL_RUNTIME:
          unroll_loop_runtime_iterations (loop);
          break;
        case LPT_UNROLL_STUPID:
          unroll_loop_stupid (loop);
          break;
        case LPT_NONE:
          check = false;
          break;
        default:
          gcc_unreachable ();
        }
      if (check)
        {
#ifdef ENABLE_CHECKING
          verify_dominators (CDI_DOMINATORS);
          verify_loop_structure ();
#endif
        }
    }

  iv_analysis_done ();
}

/* Check whether the exit of LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx insn;

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn))
        return false;
    }

  return true;
}

/* Depending on FLAGS, check whether to peel loops completely and do so.  */
static void
peel_loops_completely (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* Scan the loops, the inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
        fprintf (dump_file,
                 "\n;; *** Considering loop %d for complete peeling ***\n",
                 loop->num);

      loop->ninsns = num_loop_insns (loop);

      decide_peel_once_rolling (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_completely (loop, flags);

      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
        {
          peel_loop_completely (loop);
#ifdef ENABLE_CHECKING
          verify_dominators (CDI_DOMINATORS);
          verify_loop_structure ();
#endif
        }
    }
}

/* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
static void
decide_unrolling_and_peeling (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
        fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
        {
          if (dump_file)
            fprintf (dump_file, ";; Not considering loop, cold area\n");
          continue;
        }

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
        {
          if (dump_file)
            fprintf (dump_file,
                     ";; Not considering loop, cannot duplicate\n");
          continue;
        }

      /* Skip non-innermost loops.  */
      if (loop->inner)
        {
          if (dump_file)
            fprintf (dump_file, ";; Not considering loop, is not innermost\n");
          continue;
        }

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
         priority.  */

      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_stupid (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_simple (loop, flags);
    }
}

/* Decide whether the LOOP rolls exactly once and is thus suitable for
   complete peeling.  */
static void
decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");

  /* Is the loop small enough?  */
  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || desc->infinite
      || !desc->const_iter
      || desc->niter != 0)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop rolls exactly once\n");
      return;
    }

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Decide whether the LOOP is suitable for complete peeling.  */
static void
decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling completely\n");

  /* Skip non-innermost loops.  */
  if (loop->inner)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is not innermost\n");
      return;
    }

  /* Do not peel cold areas.  */
  if (optimize_loop_for_size_p (loop))
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, cold area\n");
      return;
    }

  /* Can the loop be manipulated?  */
  if (!can_duplicate_loop_p (loop))
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Not considering loop, cannot duplicate\n");
      return;
    }

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
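
  /* For example (hypothetical numbers): if the parameters above yield
     npeel = 8, loops with desc->niter <= 7 pass the rolls-too-much check
     below and are peeled completely, while a loop with desc->niter = 10
     is rejected.  */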

  /* Is the loop small enough?  */
  if (!npeel)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || !desc->const_iter
      || desc->infinite)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  if (desc->niter > npeel - 1)
    {
      if (dump_file)
        {
          fprintf (dump_file,
                   ";; Not peeling loop completely, rolls too much (");
          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
          fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
        }
      return;
    }

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel loop completely\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Peel all iterations of LOOP, remove exit edges and cancel the loop
   completely.  The transformation done:

   for (i = 0; i < 4; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   body; i++;
   body; i++;
   */
static void
peel_loop_completely (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned HOST_WIDE_INT npeel;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge ein;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;

  npeel = desc->niter;

  if (npeel)
    {
      bool ok;

      wont_exit = sbitmap_alloc (npeel + 1);
      sbitmap_ones (wont_exit);
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
        RESET_BIT (wont_exit, 1);

      remove_edges = NULL;

      if (flag_split_ivs_in_unroller)
        opt_info = analyze_insns_in_loop (loop);

      opt_info_start_duplication (opt_info);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          npeel,
                                          wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ
                                          | DLTHE_FLAG_COMPLETTE_PEEL
                                          | (opt_info
                                             ? DLTHE_RECORD_COPY_NUMBER : 0));
      gcc_assert (ok);

      free (wont_exit);

      if (opt_info)
        {
          apply_opt_in_copies (opt_info, npeel, false, true);
          free_opt_info (opt_info);
        }

      /* Remove the exit edges.  */
      for (i = 0; VEC_iterate (edge, remove_edges, i, ein); i++)
        remove_path (ein);
      VEC_free (edge, heap, remove_edges);
    }

  ein = desc->in_edge;
  free_simple_loop_desc (loop);

  /* Now remove the unreachable part of the last iteration and cancel
     the loop.  */
  remove_path (ein);

  if (dump_file)
    fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
             "\n;; Considering unrolling loop with constant "
             "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.  */
  if (desc->niter < 2 * nunroll)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few copies of the loop body as possible are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
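
  /* Worked example: suppose desc->niter = 100 and nunroll = 8.  The loop
     below tries unroll factors i + 1 from 19 down to 8.  With the exit at
     the start of the loop, n_copies = 100 % (i + 1) + i + 1; a factor of
     10 gives 0 + 10 = 10 copies and beats e.g. a factor of 16 with
     4 + 16 = 20 copies, so we end up with best_unroll = 9, i.e. ten
     copies of the body per iteration of the unrolled loop.  */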
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
        n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
               || desc->noloop_assumptions != NULL_RTX)
        n_copies = exit_mod + i + 2;
      else
        n_copies = i + 1;

      if (n_copies < best_copies)
        {
          best_copies = n_copies;
          best_unroll = i;
        }
    }

  if (dump_file)
    fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
             best_unroll + 1, best_copies, nunroll);

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;

  if (dump_file)
    fprintf (dump_file,
             ";; Decided to unroll the constant times rolling loop, %d times.\n",
             loop->lpt_decision.times);
}

/* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES + 1
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  sbitmap_ones (wont_exit);

  remove_edges = NULL;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
         in the first copy, so that loops that start with a test of the
         exit condition have a contiguous body after unrolling.  */

      if (dump_file)
        fprintf (dump_file, ";; Condition on beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
        RESET_BIT (wont_exit, 1);

      if (exit_mod)
        {
          opt_info_start_duplication (opt_info);
          ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                              exit_mod,
                                              wont_exit, desc->out_edge,
                                              &remove_edges,
                                              DLTHE_FLAG_UPDATE_FREQ
                                              | (opt_info && exit_mod > 1
                                                 ? DLTHE_RECORD_COPY_NUMBER
                                                 : 0));
          gcc_assert (ok);

          if (opt_info && exit_mod > 1)
            apply_opt_in_copies (opt_info, exit_mod, false, false);

          desc->noloop_assumptions = NULL_RTX;
          desc->niter -= exit_mod;
          desc->niter_max -= exit_mod;
        }

      SET_BIT (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above,
         if the loop tests the condition at the end of the loop body.  */

      if (dump_file)
        fprintf (dump_file, ";; Condition on end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
         the case where we would exit before reaching the loop.  So just peel
         exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
          || desc->noloop_assumptions)
        {
          RESET_BIT (wont_exit, 0);
          if (desc->noloop_assumptions)
            RESET_BIT (wont_exit, 1);

          opt_info_start_duplication (opt_info);
          ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                              exit_mod + 1,
                                              wont_exit, desc->out_edge,
                                              &remove_edges,
                                              DLTHE_FLAG_UPDATE_FREQ
                                              | (opt_info && exit_mod > 0
                                                 ? DLTHE_RECORD_COPY_NUMBER
                                                 : 0));
          gcc_assert (ok);

          if (opt_info && exit_mod > 0)
            apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

          desc->niter -= exit_mod + 1;
          desc->niter_max -= exit_mod + 1;
          desc->noloop_assumptions = NULL_RTX;

          SET_BIT (wont_exit, 0);
          SET_BIT (wont_exit, 1);
        }

      RESET_BIT (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  desc->niter /= max_unroll + 1;
  desc->niter_max /= max_unroll + 1;
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, constant # of iterations %i insns\n",
             max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
             "\n;; Considering unrolling loop with runtime "
             "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the number of iterations "
                 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
        fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;
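
  /* E.g. if the size limits above yield nunroll = 6, the loop leaves
     i = 4, the largest power of two not exceeding 6, so the decision
     below is times = 3, i.e. an unroll factor of 4.  */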

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file,
             ";; Decided to unroll the runtime computable "
             "times rolling loop, %d times.\n",
             loop->lpt_decision.times);
}

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info call in loop_optimizer_finalize would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do a comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by the # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Unroll LOOP for which we are able to count the number of iterations at
   runtime, LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  VEC (basic_block, heap) *dom_bbs;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  VEC (edge, heap) *remove_edges;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  dom_bbs = NULL;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      VEC (basic_block, heap) *ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      for (j = 0; VEC_iterate (basic_block, ldom, j, bb); j++)
        if (!flow_bb_inside_loop_p (loop, bb))
          VEC_safe_push (basic_block, heap, dom_bbs, bb);

      VEC_free (basic_block, heap, ldom);
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
                               niter,
                               GEN_INT (max_unroll),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);
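
  /* E.g. with max_unroll = 3 (an unroll factor of 4), NITER = 11 yields
     11 & 3 = 3 peeled iterations; this equals 11 % 4 because the factor
     is a power of two, and unlike a real modulo it stays correct even if
     the computation of NITER wrapped around.  */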

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  remove_edges = NULL;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of the loop body (almost always we must leave the
     exit test here; the only exception is when we have an extra zero check
     and the number of iterations is reliable).  Also record the place of
     the (possible) extra zero check.  */
  sbitmap_zero (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    SET_BIT (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      1, wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      sbitmap_zero (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        SET_BIT (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          1, wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
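      /* Each iteration peels one more copy and adds one case branch: the
         compare built below jumps to the current preheader when the modulo
         counter equals J, so that exactly the right number of peeled bodies
         executes before the unrolled loop is entered.  The probability
         REG_BR_PROB_BASE / (i + 2) corresponds to a roughly uniform guess
         over the case values still unmatched at this point.  */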
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
         and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  /* Remove the edges.  */
  for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
                         GEN_INT (max_unroll + 1));
  desc->niter_max /= max_unroll + 1;
  if (exit_at_end)
    {
      desc->niter_expr =
        simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      desc->niter_max--;
    }
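
  /* A sanity check with hypothetical numbers, for the exit-at-start case:
     with n = 11 and max_unroll = 3, the preconditioning above executed
     11 & 3 = 3 copies of the body, and the new loop executes
     11 / 4 = 2 full passes of four bodies each, giving 3 + 8 = 11 in
     total.  In the exit-at-end case one extra body was peeled, hence the
     additional decrement above.  */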

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, counting # of iterations "
             "in runtime, %i insns\n",
             max_unroll, num_loop_insns (loop));

  VEC_free (basic_block, heap, dom_bbs);
}

/* Decide whether to simply peel LOOP and how much.  */
static void
decide_peel_simple (struct loop *loop, int flags)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (!(flags & UAP_PEEL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering simply peeling loop\n");

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);

  /* Skip big loops.  */
  if (!npeel)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (desc->simple_p && !desc->assumptions && desc->const_iter)
    {
      if (dump_file)
        fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Do not simply peel loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not peeling, contains branches\n");
      return;
    }

  if (loop->header->count)
    {
      unsigned niter = expected_loop_iterations (loop);
      if (niter + 1 > npeel)
        {
          if (dump_file)
            {
              fprintf (dump_file, ";; Not peeling loop, rolls too much (");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) (niter + 1));
              fprintf (dump_file, " iterations > %d [maximum peelings])\n",
                       npeel);
            }
          return;
        }
      npeel = niter + 1;
    }
  else
    {
      /* For now we have no good heuristics to decide whether loop peeling
         will be effective, so disable it.  */
      if (dump_file)
        fprintf (dump_file,
                 ";; Not peeling loop, no evidence it will be profitable\n");
      return;
    }

  /* Success.  */
  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
  loop->lpt_decision.times = npeel;

  if (dump_file)
    fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
             loop->lpt_decision.times);
}

/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
   while (cond)
     body;

   ==>

   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   while (cond)
     body;
   end: ;
   */
static void
peel_loop_simple (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned npeel = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller && npeel > 1)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (npeel + 1);
  sbitmap_zero (wont_exit);

  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      npeel, wont_exit, NULL,
                                      NULL, DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  free (wont_exit);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, npeel, false, false);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      if (desc->const_iter)
        {
          desc->niter -= npeel;
          desc->niter_expr = GEN_INT (desc->niter);
          desc->noloop_assumptions = NULL_RTX;
        }
      else
        {
          /* We cannot just update niter_expr, as its value might be clobbered
             inside the loop.  We could handle this by counting the number
             into a temporary just like we do in runtime unrolling, but it
             does not seem worthwhile.  */
          free_simple_loop_desc (loop);
        }
    }
  if (dump_file)
    fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count
      && expected_loop_iterations (loop) < 2 * nunroll)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file,
             ";; Decided to unroll the loop stupidly, %d times.\n",
             loop->lpt_decision.times);
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
   while (cond)
     body;

   ==>

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  sbitmap_zero (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      nunroll, wont_exit,
                                      NULL, NULL,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                         : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here, provided that there are nontrivial
         assumptions for the loop to be really simple.  We could update the
         counts, but the problem is that we are unable to decide which exit
         will be taken (not really true in case the number of iterations is
         constant, but no one will do anything with this information, so we
         do not worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
             nunroll, num_loop_insns (loop));
}

/* A hash function for information about insns to split.  */

static hashval_t
si_info_hash (const void *ivts)
{
  return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
}

/* An equality function for information about insns to split.  */

static int
si_info_eq (const void *ivts1, const void *ivts2)
{
  const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
  const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;

  return i1->insn == i2->insn;
}

/* Return a hash for VES, which is really a "var_to_expand *".  */

static hashval_t
ve_info_hash (const void *ves)
{
  return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
}

/* Return true if IVTS1 and IVTS2 (which are really both of type
   "var_to_expand *") refer to the same instruction.  */

static int
ve_info_eq (const void *ivts1, const void *ivts2)
{
  const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
  const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;

  return i1->insn == i2->insn;
}

/* Returns true if REG is referenced in exactly one insn in LOOP.  */

bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
        {
          if (rtx_referenced_p (reg, insn))
            count_ref++;
        }
    }
  return (count_ref == 1);
}

/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of the
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx insn)
{
  rtx set, dest, src, op1, op2, something;
  struct var_to_expand *ves;
  enum machine_mode mode1, mode2;
  unsigned accum_pos;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);

  if (GET_CODE (src) != PLUS
      && GET_CODE (src) != MINUS
      && GET_CODE (src) != MULT)
    return NULL;

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for, which looks for an optab for the operation
     in SRC.  If it doesn't exist, we cannot perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (GET_CODE (src), GET_MODE (src)))
    return NULL;

  op1 = XEXP (src, 0);
  op2 = XEXP (src, 1);

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  if (rtx_equal_p (dest, op1))
    accum_pos = 0;
  else if (rtx_equal_p (dest, op2))
    accum_pos = 1;
  else
    return NULL;

  /* The method of expansion that we are using (which initializes the
     expansions with zero and sums them up at the end of the computation)
     will yield wrong results for (x = something - x); thus avoid using
     it in that case.  */
  if (accum_pos == 1
      && GET_CODE (src) == MINUS)
    return NULL;

  something = (accum_pos == 0) ? op2 : op1;

  if (!referenced_in_one_insn_in_loop_p (loop, dest))
    return NULL;

  if (rtx_referenced_p (dest, something))
    return NULL;

  mode1 = GET_MODE (dest);
  mode2 = GET_MODE (something);
  if ((FLOAT_MODE_P (mode1)
       || FLOAT_MODE_P (mode2))
      && !flag_associative_math)
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file,
               "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions = VEC_alloc (rtx, heap, 1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  ves->accum_pos = accum_pos;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and
   return a pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended, for example by also selecting addresses of memory
     references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct, as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;
  ivts->n_loc = 1;
  ivts->loc[0] = 1;

  return ivts;
}

/* Determine which insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  PTR *slot1;
  PTR *slot2;
  VEC (edge, heap) *edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
                                              si_info_hash, si_info_eq, free);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (VEC_length (edge, edges) == 1)
    {
      exit = VEC_index (edge, edges, 0);
      if (!(exit->flags & EDGE_COMPLEX))
        {
          opt_info->loop_exit = split_edge (exit);
          can_apply = true;
        }
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
                                                        ve_info_hash,
                                                        ve_info_eq, free);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
        continue;

      FOR_BB_INSNS (bb, insn)
        {
          if (!INSN_P (insn))
            continue;

          if (opt_info->insns_to_split)
            ivts = analyze_iv_to_split_insn (insn);

          if (ivts)
            {
              slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
              gcc_assert (*slot1 == NULL);
              *slot1 = ivts;
              *opt_info->iv_to_split_tail = ivts;
              opt_info->iv_to_split_tail = &ivts->next;
              continue;
            }

          if (opt_info->insns_with_var_to_expand)
            ves = analyze_insn_to_expand_var (loop, insn);

          if (ves)
            {
              slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
              gcc_assert (*slot2 == NULL);
              *slot2 = ves;
              *opt_info->var_to_expand_tail = ves;
              opt_info->var_to_expand_tail = &ves->next;
            }
        }
    }

  VEC_free (edge, heap, edges);
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records the start of the duplicated
   area in OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block;
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */
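
/* E.g. when unrolling with three new copies, the original body (copy 0)
   performs the initialization, so copy 2 lies two iterations past it and
   gets delta 2; when peeling, copy 1 initializes, and the original body,
   which runs after all the peeled copies, gets delta N_COPIES.  */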
1853
1854 static unsigned
1855 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1856 {
1857 if (unrolling)
1858 {
1859 /* If we are unrolling, initialization is done in the original loop
1860 body (number 0). */
1861 return n_copy;
1862 }
1863 else
1864 {
1865 /* If we are peeling, the copy in that the initialization occurs has
1866 number 1. The original loop (number 0) is the last. */
1867 if (n_copy)
1868 return n_copy - 1;
1869 else
1870 return n_copies;
1871 }
1872 }
1873
1874 /* Locate in EXPR the expression corresponding to the location recorded
1875 in IVTS, and return a pointer to the RTX for this location. */
1876
1877 static rtx *
1878 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1879 {
1880 unsigned i;
1881 rtx *ret = &expr;
1882
1883 for (i = 0; i < ivts->n_loc; i++)
1884 ret = &XEXP (*ret, ivts->loc[i]);
1885
1886 return ret;
1887 }
1888
1889 /* Allocate basic variable for the induction variable chain. */
1890
1891 static void
1892 allocate_basic_variable (struct iv_to_split *ivts)
1893 {
1894 rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1895
1896 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1897 }
1898
1899 /* Insert initialization of basic variable of IVTS before INSN, taking
1900 the initial value from INSN. */
1901
1902 static void
1903 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
1904 {
1905 rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
1906 rtx seq;
1907
1908 start_sequence ();
1909 expr = force_operand (expr, ivts->base_var);
1910 if (expr != ivts->base_var)
1911 emit_move_insn (ivts->base_var, expr);
1912 seq = get_insns ();
1913 end_sequence ();
1914
1915 emit_insn_before (seq, insn);
1916 }

/* Replace the use of the induction variable described in IVTS in INSN
   with base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
{
  rtx expr, *loc, seq, incr, var;
  enum machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
                                  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
                                  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = get_ivts_expr (single_set (insn), ivts);

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* As a last resort, try recreating the assignment in INSN completely
     from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
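
/* For example, for an induction variable with step 4 that has been based
   on BASE_VAR, the use in copy number 2 (DELTA == 2) is rewritten by
   split_iv to the equivalent of

     base_var + 8

   i.e. (plus (reg base_var) (const_int 8)), the expression being forced
   into a fresh register first when INSN does not accept it directly.  */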

/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);

  if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
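
/* With three recorded expansions E1, E2 and E3, successive calls to
   get_expansion hand out VE->reg, E1, E2, E3, VE->reg, E1, ... -- the
   accumulator and its expansions are reused in round-robin fashion.  */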

/* Given INSN, replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Otherwise reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_change (insn, &SET_DEST (set), new_reg, 1);
  validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);

  if (apply_change_group ())
    if (really_new_expansion)
      {
        VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
        ve->expansion_count++;
      }
}
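
/* An accumulation insn such as

     sum = sum + x;

   in one of the copies thus becomes

     sum1 = sum1 + x;

   where sum1 is either a brand new pseudo (while the limit set by
   PARAM_MAX_VARIABLE_EXPANSIONS has not been reached) or a register
   handed back by get_expansion.  */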

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of SUM should be -zero, thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
                                     basic_block place)
{
  rtx seq, var, zero_init, insn;
  unsigned i;
  enum machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  if (ve->op == PLUS || ve->op == MINUS)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      {
        if (honor_signed_zero_p)
          zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
        else
          zero_init = CONST0_RTX (mode);

        emit_move_insn (var, zero_init);
      }
  else if (ve->op == MULT)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      {
        zero_init = CONST1_RTX (GET_MODE (var));
        emit_move_insn (var, zero_init);
      }

  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
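
/* For the sum/sum1/sum2 example above, the preheader sequence amounts to

     sum1 = -0.0;
     sum2 = -0.0;

   if the mode honors signed zeros (plain 0 otherwise); for a MULT
   accumulator the expansions are initialized with 1 instead.  */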

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, seq, var, insn;
  unsigned i;

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  if (ve->op == PLUS || ve->op == MINUS)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
  else if (ve->op == MULT)
    for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
      sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
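
/* Completing the example, at the loop exit this emits code computing

     sum = sum1 + sum2 + sum;

   (products instead of sums for a MULT accumulator), so that VE->reg
   again holds the complete result.  */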

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body
   of the loop (as happens in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);

      /* BB->aux holds the position in the copy sequence, initialized by
         duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t) bb->aux, n_copies,
                                        unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
        {
          next = NEXT_INSN (insn);
          if (!INSN_P (insn))
            continue;

          while (!INSN_P (orig_insn))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply the iv splitting optimization.  */
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);

              if (ivts)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));

                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }
          /* Apply the variable expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand)
            {
              ves = (struct var_to_expand *)
                htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
              if (ves)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }
          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Also rewrite the original loop body.  Find its blocks as the originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
        continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
            continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }
        }
    }
}
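
/* As a rough sketch of the combined effect (ignoring the exit-test
   bookkeeping), unrolling

     for (i = 0; i < n; i++)
       sum += a[i];

   two times and applying both optimizations yields code of the shape

     sum1 = 0; sum2 = 0;
     loop:
       sum  += a[base];
       sum1 += a[base + 1];
       sum2 += a[base + 2];
       base += 3;
       if (base < n) goto loop;
     sum = sum1 + sum2 + sum;

   with each use of the induction variable rewritten to
   BASE + DELTA * STEP and the accumulator expanded into SUM1 and SUM2.  */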

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  if (opt_info->insns_to_split)
    htab_delete (opt_info->insns_to_split);
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        VEC_free (rtx, heap, ves->var_expansions);
      htab_delete (opt_info->insns_with_var_to_expand);
    }
  free (opt_info);
}