1 /* Loop unrolling and peeling.
2 Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
3 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "hard-reg-set.h"
27 #include "obstack.h"
28 #include "basic-block.h"
29 #include "cfgloop.h"
30 #include "cfglayout.h"
31 #include "params.h"
32 #include "expr.h"
33 #include "hashtab.h"
34 #include "recog.h"
35 #include "target.h"
36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with a single exception) because
  39    the impact on performance is greatest there, and we want to avoid
  40    unnecessary code size growth.  The gain comes from greater sequentiality
  41    of code, better code to optimize in further passes and, in some cases,
  42    fewer tests of exit conditions.  The main problem is code growth,
  43    which impacts performance negatively due to cache effects.
44
45 What we do:
46
  47    -- complete peeling of once-rolling loops; this is the above-mentioned
  48       exception, as it causes the loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll a (small) constant number of times.
  51    -- simple peeling of the first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll a constant number of times; this is almost
  54       always a win, as we get rid of the exit condition tests.
  55    -- unrolling of loops that roll a number of times that we can compute
  56       at runtime; we also get rid of the exit condition tests here, but there
  57       is the extra expense of calculating the number of iterations
  58    -- simple unrolling of the remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see the comments at the
  62    appropriate functions below.
63
  64    There are many parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
66
  67    ??? A great problem is that we don't have a good way to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance by several percent.
  70    */
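
/* Purely for illustration (the loop shapes and trip counts below are made
   up, not taken from any particular test case):

     for (i = 0; i < 1; i++)  -- body executes exactly once; the loop is
       body;                     cancelled by complete peeling

     for (i = 0; i < 8; i++)  -- small constant trip count; complete peeling
       body;                     or constant-iteration unrolling applies

     for (i = 0; i < n; i++)  -- trip count only known at runtime; runtime
       body;                     unrolling with preconditioning applies

   Loops whose iteration count we cannot analyze at all are only unrolled
   "stupidly", and only when we are explicitly asked to do so.  */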
71
72 /* Information about induction variables to split. */
73
74 struct iv_to_split
75 {
  76   rtx insn;             /* The insn in which the induction variable occurs.  */
  77   rtx base_var;         /* The variable on which the values in further
  78                            iterations are based.  */
79 rtx step; /* Step of the induction variable. */
80 struct iv_to_split *next; /* Next entry in walking order. */
81 unsigned n_loc;
82 unsigned loc[3]; /* Location where the definition of the induction
83 variable occurs in the insn. For example if
84 N_LOC is 2, the expression is located at
85 XEXP (XEXP (single_set, loc[0]), loc[1]). */
86 };
87
88 /* Information about accumulators to expand. */
89
90 struct var_to_expand
91 {
  92   rtx insn;                       /* The insn in which the variable expansion occurs.  */
  93   rtx reg;                        /* The accumulator which is expanded.  */
  94   VEC(rtx,heap) *var_expansions;  /* The copies of the accumulator being expanded.  */
95 struct var_to_expand *next; /* Next entry in walking order. */
96 enum rtx_code op; /* The type of the accumulation - addition, subtraction
97 or multiplication. */
98 int expansion_count; /* Count the number of expansions generated so far. */
99 int reuse_expansion; /* The expansion we intend to reuse to expand
100 the accumulator. If REUSE_EXPANSION is 0 reuse
101 the original accumulator. Else use
102 var_expansions[REUSE_EXPANSION - 1]. */
103 unsigned accum_pos; /* The position in which the accumulator is placed in
104 the insn src. For example in x = x + something
105 accum_pos is 0 while in x = something + x accum_pos
106 is 1. */
107 };
108
109 /* Information about optimization applied in
110 the unrolled loop. */
111
112 struct opt_info
113 {
114 htab_t insns_to_split; /* A hashtable of insns to split. */
115 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
116 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
117 htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
118 to expand. */
119 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
120 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
121 unsigned first_new_block; /* The first basic block that was
122 duplicated. */
123 basic_block loop_exit; /* The loop exit basic block. */
124 basic_block loop_preheader; /* The loop preheader basic block. */
125 };
126
127 static void decide_unrolling_and_peeling (int);
128 static void peel_loops_completely (int);
129 static void decide_peel_simple (struct loop *, int);
130 static void decide_peel_once_rolling (struct loop *, int);
131 static void decide_peel_completely (struct loop *, int);
132 static void decide_unroll_stupid (struct loop *, int);
133 static void decide_unroll_constant_iterations (struct loop *, int);
134 static void decide_unroll_runtime_iterations (struct loop *, int);
135 static void peel_loop_simple (struct loop *);
136 static void peel_loop_completely (struct loop *);
137 static void unroll_loop_stupid (struct loop *);
138 static void unroll_loop_constant_iterations (struct loop *);
139 static void unroll_loop_runtime_iterations (struct loop *);
140 static struct opt_info *analyze_insns_in_loop (struct loop *);
141 static void opt_info_start_duplication (struct opt_info *);
142 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
143 static void free_opt_info (struct opt_info *);
144 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
145 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
146 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
147 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
148 static void insert_var_expansion_initialization (struct var_to_expand *,
149 basic_block);
150 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
151 basic_block);
152 static rtx get_expansion (struct var_to_expand *);
153
154 /* Unroll and/or peel (depending on FLAGS) LOOPS. */
155 void
156 unroll_and_peel_loops (int flags)
157 {
158 struct loop *loop;
159 bool check;
160 loop_iterator li;
161
 162   /* First perform complete loop peeling (it is almost surely a win,
 163      and strongly affects the parameters for further decisions).  */
164 peel_loops_completely (flags);
165
166 /* Now decide rest of unrolling and peeling. */
167 decide_unrolling_and_peeling (flags);
168
169 /* Scan the loops, inner ones first. */
170 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
171 {
172 check = true;
173 /* And perform the appropriate transformations. */
174 switch (loop->lpt_decision.decision)
175 {
176 case LPT_PEEL_COMPLETELY:
177 /* Already done. */
178 gcc_unreachable ();
179 case LPT_PEEL_SIMPLE:
180 peel_loop_simple (loop);
181 break;
182 case LPT_UNROLL_CONSTANT:
183 unroll_loop_constant_iterations (loop);
184 break;
185 case LPT_UNROLL_RUNTIME:
186 unroll_loop_runtime_iterations (loop);
187 break;
188 case LPT_UNROLL_STUPID:
189 unroll_loop_stupid (loop);
190 break;
191 case LPT_NONE:
192 check = false;
193 break;
194 default:
195 gcc_unreachable ();
196 }
197 if (check)
198 {
199 #ifdef ENABLE_CHECKING
200 verify_loop_structure ();
201 #endif
202 }
203 }
204
205 iv_analysis_done ();
206 }
207
 208 /* Check whether the exit of LOOP is at the end of the loop body.  */
209
210 static bool
211 loop_exit_at_end_p (struct loop *loop)
212 {
213 struct niter_desc *desc = get_simple_loop_desc (loop);
214 rtx insn;
215
216 if (desc->in_edge->dest != loop->latch)
217 return false;
218
219 /* Check that the latch is empty. */
220 FOR_BB_INSNS (loop->latch, insn)
221 {
222 if (INSN_P (insn))
223 return false;
224 }
225
226 return true;
227 }
228
229 /* Depending on FLAGS, check whether to peel loops completely and do so. */
230 static void
231 peel_loops_completely (int flags)
232 {
233 struct loop *loop;
234 loop_iterator li;
235
236 /* Scan the loops, the inner ones first. */
237 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
238 {
239 loop->lpt_decision.decision = LPT_NONE;
240
241 if (dump_file)
242 fprintf (dump_file,
243 "\n;; *** Considering loop %d for complete peeling ***\n",
244 loop->num);
245
246 loop->ninsns = num_loop_insns (loop);
247
248 decide_peel_once_rolling (loop, flags);
249 if (loop->lpt_decision.decision == LPT_NONE)
250 decide_peel_completely (loop, flags);
251
252 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
253 {
254 peel_loop_completely (loop);
255 #ifdef ENABLE_CHECKING
256 verify_loop_structure ();
257 #endif
258 }
259 }
260 }
261
 262 /* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
263 static void
264 decide_unrolling_and_peeling (int flags)
265 {
266 struct loop *loop;
267 loop_iterator li;
268
269 /* Scan the loops, inner ones first. */
270 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
271 {
272 loop->lpt_decision.decision = LPT_NONE;
273
274 if (dump_file)
275 fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
276
277 /* Do not peel cold areas. */
278 if (optimize_loop_for_size_p (loop))
279 {
280 if (dump_file)
281 fprintf (dump_file, ";; Not considering loop, cold area\n");
282 continue;
283 }
284
285 /* Can the loop be manipulated? */
286 if (!can_duplicate_loop_p (loop))
287 {
288 if (dump_file)
289 fprintf (dump_file,
290 ";; Not considering loop, cannot duplicate\n");
291 continue;
292 }
293
294 /* Skip non-innermost loops. */
295 if (loop->inner)
296 {
297 if (dump_file)
298 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
299 continue;
300 }
301
302 loop->ninsns = num_loop_insns (loop);
303 loop->av_ninsns = average_num_loop_insns (loop);
304
305 /* Try transformations one by one in decreasing order of
306 priority. */
307
308 decide_unroll_constant_iterations (loop, flags);
309 if (loop->lpt_decision.decision == LPT_NONE)
310 decide_unroll_runtime_iterations (loop, flags);
311 if (loop->lpt_decision.decision == LPT_NONE)
312 decide_unroll_stupid (loop, flags);
313 if (loop->lpt_decision.decision == LPT_NONE)
314 decide_peel_simple (loop, flags);
315 }
316 }
317
318 /* Decide whether the LOOP is once rolling and suitable for complete
319 peeling. */
320 static void
321 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
322 {
323 struct niter_desc *desc;
324
325 if (dump_file)
326 fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
327
328 /* Is the loop small enough? */
329 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
330 {
331 if (dump_file)
332 fprintf (dump_file, ";; Not considering loop, is too big\n");
333 return;
334 }
335
336 /* Check for simple loops. */
337 desc = get_simple_loop_desc (loop);
338
339 /* Check number of iterations. */
340 if (!desc->simple_p
341 || desc->assumptions
342 || desc->infinite
343 || !desc->const_iter
344 || desc->niter != 0)
345 {
346 if (dump_file)
347 fprintf (dump_file,
348 ";; Unable to prove that the loop rolls exactly once\n");
349 return;
350 }
351
352 /* Success. */
353 if (dump_file)
354 fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
355 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
356 }
357
358 /* Decide whether the LOOP is suitable for complete peeling. */
359 static void
360 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
361 {
362 unsigned npeel;
363 struct niter_desc *desc;
364
365 if (dump_file)
366 fprintf (dump_file, "\n;; Considering peeling completely\n");
367
368 /* Skip non-innermost loops. */
369 if (loop->inner)
370 {
371 if (dump_file)
372 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
373 return;
374 }
375
376 /* Do not peel cold areas. */
377 if (optimize_loop_for_size_p (loop))
378 {
379 if (dump_file)
380 fprintf (dump_file, ";; Not considering loop, cold area\n");
381 return;
382 }
383
384 /* Can the loop be manipulated? */
385 if (!can_duplicate_loop_p (loop))
386 {
387 if (dump_file)
388 fprintf (dump_file,
389 ";; Not considering loop, cannot duplicate\n");
390 return;
391 }
392
393 /* npeel = number of iterations to peel. */
394 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
395 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
396 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
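
  /* For instance (the numbers here are illustrative, not the defaults from
     params.def): with PARAM_MAX_COMPLETELY_PEELED_INSNS == 400 and a loop
     body of 25 insns, the division above gives npeel == 16; if
     PARAM_MAX_COMPLETELY_PEEL_TIMES were 8, the clamp would lower npeel to 8,
     and the iteration-count check below then rejects loops that roll too
     many times for this bound.  */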
397
398 /* Is the loop small enough? */
399 if (!npeel)
400 {
401 if (dump_file)
402 fprintf (dump_file, ";; Not considering loop, is too big\n");
403 return;
404 }
405
406 /* Check for simple loops. */
407 desc = get_simple_loop_desc (loop);
408
409 /* Check number of iterations. */
410 if (!desc->simple_p
411 || desc->assumptions
412 || !desc->const_iter
413 || desc->infinite)
414 {
415 if (dump_file)
416 fprintf (dump_file,
417 ";; Unable to prove that the loop iterates constant times\n");
418 return;
419 }
420
421 if (desc->niter > npeel - 1)
422 {
423 if (dump_file)
424 {
425 fprintf (dump_file,
426 ";; Not peeling loop completely, rolls too much (");
427 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
428 fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
429 }
430 return;
431 }
432
433 /* Success. */
434 if (dump_file)
435 fprintf (dump_file, ";; Decided to peel loop completely\n");
436 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
437 }
438
439 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
440 completely. The transformation done:
441
442 for (i = 0; i < 4; i++)
443 body;
444
445 ==>
446
447 i = 0;
448 body; i++;
449 body; i++;
450 body; i++;
451 body; i++;
452 */
453 static void
454 peel_loop_completely (struct loop *loop)
455 {
456 sbitmap wont_exit;
457 unsigned HOST_WIDE_INT npeel;
458 unsigned i;
459 VEC (edge, heap) *remove_edges;
460 edge ein;
461 struct niter_desc *desc = get_simple_loop_desc (loop);
462 struct opt_info *opt_info = NULL;
463
464 npeel = desc->niter;
465
466 if (npeel)
467 {
468 bool ok;
469
470 wont_exit = sbitmap_alloc (npeel + 1);
471 sbitmap_ones (wont_exit);
472 RESET_BIT (wont_exit, 0);
473 if (desc->noloop_assumptions)
474 RESET_BIT (wont_exit, 1);
475
476 remove_edges = NULL;
477
478 if (flag_split_ivs_in_unroller)
479 opt_info = analyze_insns_in_loop (loop);
480
481 opt_info_start_duplication (opt_info);
482 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
483 npeel,
484 wont_exit, desc->out_edge,
485 &remove_edges,
486 DLTHE_FLAG_UPDATE_FREQ
487 | DLTHE_FLAG_COMPLETTE_PEEL
488 | (opt_info
489 ? DLTHE_RECORD_COPY_NUMBER : 0));
490 gcc_assert (ok);
491
492 free (wont_exit);
493
494 if (opt_info)
495 {
496 apply_opt_in_copies (opt_info, npeel, false, true);
497 free_opt_info (opt_info);
498 }
499
500 /* Remove the exit edges. */
501 FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
502 remove_path (ein);
503 VEC_free (edge, heap, remove_edges);
504 }
505
506 ein = desc->in_edge;
507 free_simple_loop_desc (loop);
508
509 /* Now remove the unreachable part of the last iteration and cancel
510 the loop. */
511 remove_path (ein);
512
513 if (dump_file)
514 fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
515 }
516
 517 /* Decide whether to unroll LOOP iterating a constant number of times
518 and how much. */
519
520 static void
521 decide_unroll_constant_iterations (struct loop *loop, int flags)
522 {
523 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
524 struct niter_desc *desc;
525
526 if (!(flags & UAP_UNROLL))
527 {
 528       /* We were not asked to, just return silently.  */
529 return;
530 }
531
532 if (dump_file)
533 fprintf (dump_file,
534 "\n;; Considering unrolling loop with constant "
535 "number of iterations\n");
536
 537   /* nunroll = total number of copies of the original loop body in
 538      the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
539 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
540 nunroll_by_av
541 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
542 if (nunroll > nunroll_by_av)
543 nunroll = nunroll_by_av;
544 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
545 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
546
547 /* Skip big loops. */
548 if (nunroll <= 1)
549 {
550 if (dump_file)
551 fprintf (dump_file, ";; Not considering loop, is too big\n");
552 return;
553 }
554
555 /* Check for simple loops. */
556 desc = get_simple_loop_desc (loop);
557
558 /* Check number of iterations. */
559 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
560 {
561 if (dump_file)
562 fprintf (dump_file,
563 ";; Unable to prove that the loop iterates constant times\n");
564 return;
565 }
566
567 /* Check whether the loop rolls enough to consider. */
568 if (desc->niter < 2 * nunroll)
569 {
570 if (dump_file)
571 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
572 return;
573 }
574
 575   /* Success; now compute the number of iterations to unroll.  We alter
 576      nunroll so that as few copies of the loop body as possible are
 577      necessary, while still not decreasing the number of unrollings
 578      too much (at most by 1).  */
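
  /* A worked example with made-up numbers: suppose desc->niter == 100 and
     nunroll == 8.  The search below tries i from 18 down to 7.  For i == 9
     (an unroll factor of 10) exit_mod is 0, so n_copies == 10 when the exit
     is not at the end of the body; for i == 7 (factor 8) exit_mod is 4 and
     n_copies == 12.  The factor-of-10 choice wins because it needs fewer
     copies of the body, so best_unroll ends up as 9.  */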
579 best_copies = 2 * nunroll + 10;
580
581 i = 2 * nunroll + 2;
582 if (i - 1 >= desc->niter)
583 i = desc->niter - 2;
584
585 for (; i >= nunroll - 1; i--)
586 {
587 unsigned exit_mod = desc->niter % (i + 1);
588
589 if (!loop_exit_at_end_p (loop))
590 n_copies = exit_mod + i + 1;
591 else if (exit_mod != (unsigned) i
592 || desc->noloop_assumptions != NULL_RTX)
593 n_copies = exit_mod + i + 2;
594 else
595 n_copies = i + 1;
596
597 if (n_copies < best_copies)
598 {
599 best_copies = n_copies;
600 best_unroll = i;
601 }
602 }
603
604 if (dump_file)
605 fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
606 best_unroll + 1, best_copies, nunroll);
607
608 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
609 loop->lpt_decision.times = best_unroll;
610
611 if (dump_file)
612 fprintf (dump_file,
613 ";; Decided to unroll the constant times rolling loop, %d times.\n",
614 loop->lpt_decision.times);
615 }
616
 617 /* Unroll LOOP with a constant number of iterations LOOP->LPT_DECISION.TIMES + 1
618 times. The transformation does this:
619
620 for (i = 0; i < 102; i++)
621 body;
622
623 ==>
624
625 i = 0;
626 body; i++;
627 body; i++;
628 while (i < 102)
629 {
630 body; i++;
631 body; i++;
632 body; i++;
633 body; i++;
634 }
635 */
636 static void
637 unroll_loop_constant_iterations (struct loop *loop)
638 {
639 unsigned HOST_WIDE_INT niter;
640 unsigned exit_mod;
641 sbitmap wont_exit;
642 unsigned i;
643 VEC (edge, heap) *remove_edges;
644 edge e;
645 unsigned max_unroll = loop->lpt_decision.times;
646 struct niter_desc *desc = get_simple_loop_desc (loop);
647 bool exit_at_end = loop_exit_at_end_p (loop);
648 struct opt_info *opt_info = NULL;
649 bool ok;
650
651 niter = desc->niter;
652
 653   /* Should not get here (such a loop should be peeled instead).  */
654 gcc_assert (niter > max_unroll + 1);
655
656 exit_mod = niter % (max_unroll + 1);
657
658 wont_exit = sbitmap_alloc (max_unroll + 1);
659 sbitmap_ones (wont_exit);
660
661 remove_edges = NULL;
662 if (flag_split_ivs_in_unroller
663 || flag_variable_expansion_in_unroller)
664 opt_info = analyze_insns_in_loop (loop);
665
666 if (!exit_at_end)
667 {
 668       /* The exit is not at the end of the loop; leave the exit test
 669          in the first copy, so that loops that start with a test
 670          of the exit condition have a contiguous body after unrolling.  */
671
672 if (dump_file)
673 fprintf (dump_file, ";; Condition on beginning of loop.\n");
674
675 /* Peel exit_mod iterations. */
676 RESET_BIT (wont_exit, 0);
677 if (desc->noloop_assumptions)
678 RESET_BIT (wont_exit, 1);
679
680 if (exit_mod)
681 {
682 opt_info_start_duplication (opt_info);
683 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
684 exit_mod,
685 wont_exit, desc->out_edge,
686 &remove_edges,
687 DLTHE_FLAG_UPDATE_FREQ
688 | (opt_info && exit_mod > 1
689 ? DLTHE_RECORD_COPY_NUMBER
690 : 0));
691 gcc_assert (ok);
692
693 if (opt_info && exit_mod > 1)
694 apply_opt_in_copies (opt_info, exit_mod, false, false);
695
696 desc->noloop_assumptions = NULL_RTX;
697 desc->niter -= exit_mod;
698 desc->niter_max -= exit_mod;
699 }
700
701 SET_BIT (wont_exit, 1);
702 }
703 else
704 {
 705       /* Leave the exit test in the last copy, for the same reason as above,
 706          as the loop tests the condition at the end of the loop body.  */
707
708 if (dump_file)
709 fprintf (dump_file, ";; Condition on end of loop.\n");
710
 711       /* We know that niter >= max_unroll + 2; so we do not need to worry
 712          about the case where we would exit before reaching the loop.  So
 713          just peel exit_mod + 1 iterations.  */
714 if (exit_mod != max_unroll
715 || desc->noloop_assumptions)
716 {
717 RESET_BIT (wont_exit, 0);
718 if (desc->noloop_assumptions)
719 RESET_BIT (wont_exit, 1);
720
721 opt_info_start_duplication (opt_info);
722 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
723 exit_mod + 1,
724 wont_exit, desc->out_edge,
725 &remove_edges,
726 DLTHE_FLAG_UPDATE_FREQ
727 | (opt_info && exit_mod > 0
728 ? DLTHE_RECORD_COPY_NUMBER
729 : 0));
730 gcc_assert (ok);
731
732 if (opt_info && exit_mod > 0)
733 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
734
735 desc->niter -= exit_mod + 1;
736 desc->niter_max -= exit_mod + 1;
737 desc->noloop_assumptions = NULL_RTX;
738
739 SET_BIT (wont_exit, 0);
740 SET_BIT (wont_exit, 1);
741 }
742
743 RESET_BIT (wont_exit, max_unroll);
744 }
745
746 /* Now unroll the loop. */
747
748 opt_info_start_duplication (opt_info);
749 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
750 max_unroll,
751 wont_exit, desc->out_edge,
752 &remove_edges,
753 DLTHE_FLAG_UPDATE_FREQ
754 | (opt_info
755 ? DLTHE_RECORD_COPY_NUMBER
756 : 0));
757 gcc_assert (ok);
758
759 if (opt_info)
760 {
761 apply_opt_in_copies (opt_info, max_unroll, true, true);
762 free_opt_info (opt_info);
763 }
764
765 free (wont_exit);
766
767 if (exit_at_end)
768 {
769 basic_block exit_block = get_bb_copy (desc->in_edge->src);
 770       /* Find new in and out edges; they are in the last copy we have made.  */
771
772 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
773 {
774 desc->out_edge = EDGE_SUCC (exit_block, 0);
775 desc->in_edge = EDGE_SUCC (exit_block, 1);
776 }
777 else
778 {
779 desc->out_edge = EDGE_SUCC (exit_block, 1);
780 desc->in_edge = EDGE_SUCC (exit_block, 0);
781 }
782 }
783
784 desc->niter /= max_unroll + 1;
785 desc->niter_max /= max_unroll + 1;
786 desc->niter_expr = GEN_INT (desc->niter);
787
788 /* Remove the edges. */
789 FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
790 remove_path (e);
791 VEC_free (edge, heap, remove_edges);
792
793 if (dump_file)
794 fprintf (dump_file,
795 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
796 max_unroll, num_loop_insns (loop));
797 }
798
 799 /* Decide whether to unroll LOOP iterating a runtime-computable number of times
 800    and how much.  */
801 static void
802 decide_unroll_runtime_iterations (struct loop *loop, int flags)
803 {
804 unsigned nunroll, nunroll_by_av, i;
805 struct niter_desc *desc;
806
807 if (!(flags & UAP_UNROLL))
808 {
 809       /* We were not asked to, just return silently.  */
810 return;
811 }
812
813 if (dump_file)
814 fprintf (dump_file,
815 "\n;; Considering unrolling loop with runtime "
816 "computable number of iterations\n");
817
 818   /* nunroll = total number of copies of the original loop body in
 819      the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
820 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
821 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
822 if (nunroll > nunroll_by_av)
823 nunroll = nunroll_by_av;
824 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
825 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
826
827 if (targetm.loop_unroll_adjust)
828 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
829
830 /* Skip big loops. */
831 if (nunroll <= 1)
832 {
833 if (dump_file)
834 fprintf (dump_file, ";; Not considering loop, is too big\n");
835 return;
836 }
837
838 /* Check for simple loops. */
839 desc = get_simple_loop_desc (loop);
840
841 /* Check simpleness. */
842 if (!desc->simple_p || desc->assumptions)
843 {
844 if (dump_file)
845 fprintf (dump_file,
846 ";; Unable to prove that the number of iterations "
847 "can be counted in runtime\n");
848 return;
849 }
850
851 if (desc->const_iter)
852 {
853 if (dump_file)
854 fprintf (dump_file, ";; Loop iterates constant times\n");
855 return;
856 }
857
858 /* If we have profile feedback, check whether the loop rolls. */
859 if ((loop->header->count
860 && expected_loop_iterations (loop) < 2 * nunroll)
861 || desc->niter_max < 2 * nunroll)
862 {
863 if (dump_file)
864 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
865 return;
866 }
867
 868   /* Success; now force nunroll to be a power of 2, as we are unable to
 869      cope with overflows in the computation of the number of iterations.  */
870 for (i = 1; 2 * i <= nunroll; i *= 2)
871 continue;
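
  /* For instance, an nunroll of 12 (an arbitrary value) leaves i == 8 after
     this loop, so the unroll factor will be 8 and lpt_decision.times becomes
     7; the power-of-two factor is what later allows the preconditioning code
     to compute niter % 8 with a simple AND.  */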
872
873 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
874 loop->lpt_decision.times = i - 1;
875
876 if (dump_file)
877 fprintf (dump_file,
878 ";; Decided to unroll the runtime computable "
879 "times rolling loop, %d times.\n",
880 loop->lpt_decision.times);
881 }
882
883 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
884 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
885 and NULL is returned instead. */
886
887 basic_block
888 split_edge_and_insert (edge e, rtx insns)
889 {
890 basic_block bb;
891
892 if (!insns)
893 return NULL;
894 bb = split_edge (e);
895 emit_insn_after (insns, BB_END (bb));
896
897 /* ??? We used to assume that INSNS can contain control flow insns, and
898 that we had to try to find sub basic blocks in BB to maintain a valid
899 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
900 and call break_superblocks when going out of cfglayout mode. But it
901 turns out that this never happens; and that if it does ever happen,
902 the TODO_verify_flow at the end of the RTL loop passes would fail.
903
904 There are two reasons why we expected we could have control flow insns
905 in INSNS. The first is when a comparison has to be done in parts, and
906 the second is when the number of iterations is computed for loops with
907 the number of iterations known at runtime. In both cases, test cases
908 to get control flow in INSNS appear to be impossible to construct:
909
910 * If do_compare_rtx_and_jump needs several branches to do comparison
911 in a mode that needs comparison by parts, we cannot analyze the
912 number of iterations of the loop, and we never get to unrolling it.
913
914 * The code in expand_divmod that was suspected to cause creation of
915 branching code seems to be only accessed for signed division. The
916 divisions used by # of iterations analysis are always unsigned.
 917      Problems might arise on architectures that emit branching code
 918      for some operations that may appear in the unroller (especially
 919      for division), but we know of no such architectures.
920
921 Considering all this, it was decided that we should for now assume
922 that INSNS can in theory contain control flow insns, but in practice
923 it never does. So we don't handle the theoretical case, and should
924 a real failure ever show up, we have a pretty good clue for how to
925 fix it. */
926
927 return bb;
928 }
929
 930 /* Unroll LOOP, for which we are able to count the number of iterations at
 931    runtime, LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this
 932    (with some extra care for the case n < 0):
933
934 for (i = 0; i < n; i++)
935 body;
936
937 ==>
938
939 i = 0;
940 mod = n % 4;
941
942 switch (mod)
943 {
944 case 3:
945 body; i++;
946 case 2:
947 body; i++;
948 case 1:
949 body; i++;
950 case 0: ;
951 }
952
953 while (i < n)
954 {
955 body; i++;
956 body; i++;
957 body; i++;
958 body; i++;
959 }
960 */
961 static void
962 unroll_loop_runtime_iterations (struct loop *loop)
963 {
964 rtx old_niter, niter, init_code, branch_code, tmp;
965 unsigned i, j, p;
966 basic_block preheader, *body, swtch, ezc_swtch;
967 VEC (basic_block, heap) *dom_bbs;
968 sbitmap wont_exit;
969 int may_exit_copy;
970 unsigned n_peel;
971 VEC (edge, heap) *remove_edges;
972 edge e;
973 bool extra_zero_check, last_may_exit;
974 unsigned max_unroll = loop->lpt_decision.times;
975 struct niter_desc *desc = get_simple_loop_desc (loop);
976 bool exit_at_end = loop_exit_at_end_p (loop);
977 struct opt_info *opt_info = NULL;
978 bool ok;
979
980 if (flag_split_ivs_in_unroller
981 || flag_variable_expansion_in_unroller)
982 opt_info = analyze_insns_in_loop (loop);
983
984 /* Remember blocks whose dominators will have to be updated. */
985 dom_bbs = NULL;
986
987 body = get_loop_body (loop);
988 for (i = 0; i < loop->num_nodes; i++)
989 {
990 VEC (basic_block, heap) *ldom;
991 basic_block bb;
992
993 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
994 FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
995 if (!flow_bb_inside_loop_p (loop, bb))
996 VEC_safe_push (basic_block, heap, dom_bbs, bb);
997
998 VEC_free (basic_block, heap, ldom);
999 }
1000 free (body);
1001
1002 if (!exit_at_end)
1003 {
1004 /* Leave exit in first copy (for explanation why see comment in
1005 unroll_loop_constant_iterations). */
1006 may_exit_copy = 0;
1007 n_peel = max_unroll - 1;
1008 extra_zero_check = true;
1009 last_may_exit = false;
1010 }
1011 else
1012 {
1013 /* Leave exit in last copy (for explanation why see comment in
1014 unroll_loop_constant_iterations). */
1015 may_exit_copy = max_unroll;
1016 n_peel = max_unroll;
1017 extra_zero_check = false;
1018 last_may_exit = true;
1019 }
1020
1021 /* Get expression for number of iterations. */
1022 start_sequence ();
1023 old_niter = niter = gen_reg_rtx (desc->mode);
1024 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1025 if (tmp != niter)
1026 emit_move_insn (niter, tmp);
1027
1028 /* Count modulo by ANDing it with max_unroll; we use the fact that
1029 the number of unrollings is a power of two, and thus this is correct
1030 even if there is overflow in the computation. */
1031 niter = expand_simple_binop (desc->mode, AND,
1032 niter,
1033 GEN_INT (max_unroll),
1034 NULL_RTX, 0, OPTAB_LIB_WIDEN);
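
  /* E.g. if max_unroll is 7 (an unroll factor of 8), the AND with 7 computes
     niter % 8; because 8 is a power of two this holds for any unsigned value,
     even if the expression computing niter itself wrapped around.  */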
1035
1036 init_code = get_insns ();
1037 end_sequence ();
1038 unshare_all_rtl_in_chain (init_code);
1039
1040 /* Precondition the loop. */
1041 split_edge_and_insert (loop_preheader_edge (loop), init_code);
1042
1043 remove_edges = NULL;
1044
1045 wont_exit = sbitmap_alloc (max_unroll + 2);
1046
1047   /* Peel the first copy of the loop body (almost always we must leave the
1048      exit test here; the only exception is when we have an extra zero check
1049      and the number of iterations is reliable).  Also record the place of the
1050      (possible) extra zero check.  */
1051 sbitmap_zero (wont_exit);
1052 if (extra_zero_check
1053 && !desc->noloop_assumptions)
1054 SET_BIT (wont_exit, 1);
1055 ezc_swtch = loop_preheader_edge (loop)->src;
1056 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1057 1, wont_exit, desc->out_edge,
1058 &remove_edges,
1059 DLTHE_FLAG_UPDATE_FREQ);
1060 gcc_assert (ok);
1061
1062 /* Record the place where switch will be built for preconditioning. */
1063 swtch = split_edge (loop_preheader_edge (loop));
1064
1065 for (i = 0; i < n_peel; i++)
1066 {
1067 /* Peel the copy. */
1068 sbitmap_zero (wont_exit);
1069 if (i != n_peel - 1 || !last_may_exit)
1070 SET_BIT (wont_exit, 1);
1071 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1072 1, wont_exit, desc->out_edge,
1073 &remove_edges,
1074 DLTHE_FLAG_UPDATE_FREQ);
1075 gcc_assert (ok);
1076
1077 /* Create item for switch. */
1078 j = n_peel - i - (extra_zero_check ? 0 : 1);
1079 p = REG_BR_PROB_BASE / (i + 2);
1080
1081 preheader = split_edge (loop_preheader_edge (loop));
1082 branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1083 block_label (preheader), p,
1084 NULL_RTX);
1085
1086 /* We rely on the fact that the compare and jump cannot be optimized out,
1087 and hence the cfg we create is correct. */
1088 gcc_assert (branch_code != NULL_RTX);
1089
1090 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1091 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1092 single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1093 e = make_edge (swtch, preheader,
1094 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1095 e->probability = p;
1096 }
1097
1098 if (extra_zero_check)
1099 {
1100 /* Add branch for zero iterations. */
1101 p = REG_BR_PROB_BASE / (max_unroll + 1);
1102 swtch = ezc_swtch;
1103 preheader = split_edge (loop_preheader_edge (loop));
1104 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1105 block_label (preheader), p,
1106 NULL_RTX);
1107 gcc_assert (branch_code != NULL_RTX);
1108
1109 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1110 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1111 single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1112 e = make_edge (swtch, preheader,
1113 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1114 e->probability = p;
1115 }
1116
1117 /* Recount dominators for outer blocks. */
1118 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1119
1120 /* And unroll loop. */
1121
1122 sbitmap_ones (wont_exit);
1123 RESET_BIT (wont_exit, may_exit_copy);
1124 opt_info_start_duplication (opt_info);
1125
1126 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1127 max_unroll,
1128 wont_exit, desc->out_edge,
1129 &remove_edges,
1130 DLTHE_FLAG_UPDATE_FREQ
1131 | (opt_info
1132 ? DLTHE_RECORD_COPY_NUMBER
1133 : 0));
1134 gcc_assert (ok);
1135
1136 if (opt_info)
1137 {
1138 apply_opt_in_copies (opt_info, max_unroll, true, true);
1139 free_opt_info (opt_info);
1140 }
1141
1142 free (wont_exit);
1143
1144 if (exit_at_end)
1145 {
1146 basic_block exit_block = get_bb_copy (desc->in_edge->src);
1147       /* Find new in and out edges; they are in the last copy we have
1148 made. */
1149
1150 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1151 {
1152 desc->out_edge = EDGE_SUCC (exit_block, 0);
1153 desc->in_edge = EDGE_SUCC (exit_block, 1);
1154 }
1155 else
1156 {
1157 desc->out_edge = EDGE_SUCC (exit_block, 1);
1158 desc->in_edge = EDGE_SUCC (exit_block, 0);
1159 }
1160 }
1161
1162 /* Remove the edges. */
1163 FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
1164 remove_path (e);
1165 VEC_free (edge, heap, remove_edges);
1166
1167 /* We must be careful when updating the number of iterations due to
1168 preconditioning and the fact that the value must be valid at entry
1169 of the loop. After passing through the above code, we see that
1170 the correct new number of iterations is this: */
1171 gcc_assert (!desc->const_iter);
1172 desc->niter_expr =
1173 simplify_gen_binary (UDIV, desc->mode, old_niter,
1174 GEN_INT (max_unroll + 1));
1175 desc->niter_max /= max_unroll + 1;
1176 if (exit_at_end)
1177 {
1178 desc->niter_expr =
1179 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1180 desc->noloop_assumptions = NULL_RTX;
1181 desc->niter_max--;
1182 }
1183
1184 if (dump_file)
1185 fprintf (dump_file,
1186 ";; Unrolled loop %d times, counting # of iterations "
1187 "in runtime, %i insns\n",
1188 max_unroll, num_loop_insns (loop));
1189
1190 VEC_free (basic_block, heap, dom_bbs);
1191 }
1192
1193 /* Decide whether to simply peel LOOP and how much. */
1194 static void
1195 decide_peel_simple (struct loop *loop, int flags)
1196 {
1197 unsigned npeel;
1198 struct niter_desc *desc;
1199
1200 if (!(flags & UAP_PEEL))
1201 {
1202       /* We were not asked to, just return silently.  */
1203 return;
1204 }
1205
1206 if (dump_file)
1207 fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1208
1209 /* npeel = number of iterations to peel. */
1210 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1211 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1212 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
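
  /* As an illustration (values invented, not the defaults): a 20-insn body
     with PARAM_MAX_PEELED_INSNS == 100 would give npeel == 5, possibly
     clamped further by PARAM_MAX_PEEL_TIMES; with profile feedback, npeel is
     later refined to the expected iteration count plus one, provided that
     fits within this bound.  */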
1213
1214 /* Skip big loops. */
1215 if (!npeel)
1216 {
1217 if (dump_file)
1218 fprintf (dump_file, ";; Not considering loop, is too big\n");
1219 return;
1220 }
1221
1222 /* Check for simple loops. */
1223 desc = get_simple_loop_desc (loop);
1224
1225 /* Check number of iterations. */
1226 if (desc->simple_p && !desc->assumptions && desc->const_iter)
1227 {
1228 if (dump_file)
1229 fprintf (dump_file, ";; Loop iterates constant times\n");
1230 return;
1231 }
1232
1233   /* Do not simply peel loops with branches inside -- it increases the number
1234      of mispredicts.  */
1235 if (num_loop_branches (loop) > 1)
1236 {
1237 if (dump_file)
1238 fprintf (dump_file, ";; Not peeling, contains branches\n");
1239 return;
1240 }
1241
1242 if (loop->header->count)
1243 {
1244 unsigned niter = expected_loop_iterations (loop);
1245 if (niter + 1 > npeel)
1246 {
1247 if (dump_file)
1248 {
1249 fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1250 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1251 (HOST_WIDEST_INT) (niter + 1));
1252 fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1253 npeel);
1254 }
1255 return;
1256 }
1257 npeel = niter + 1;
1258 }
1259 else
1260 {
1261 /* For now we have no good heuristics to decide whether loop peeling
1262 will be effective, so disable it. */
1263 if (dump_file)
1264 fprintf (dump_file,
1265 ";; Not peeling loop, no evidence it will be profitable\n");
1266 return;
1267 }
1268
1269 /* Success. */
1270 loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1271 loop->lpt_decision.times = npeel;
1272
1273 if (dump_file)
1274 fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1275 loop->lpt_decision.times);
1276 }
1277
1278 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1279 while (cond)
1280 body;
1281
1282 ==>
1283
1284 if (!cond) goto end;
1285 body;
1286 if (!cond) goto end;
1287 body;
1288 while (cond)
1289 body;
1290 end: ;
1291 */
1292 static void
1293 peel_loop_simple (struct loop *loop)
1294 {
1295 sbitmap wont_exit;
1296 unsigned npeel = loop->lpt_decision.times;
1297 struct niter_desc *desc = get_simple_loop_desc (loop);
1298 struct opt_info *opt_info = NULL;
1299 bool ok;
1300
1301 if (flag_split_ivs_in_unroller && npeel > 1)
1302 opt_info = analyze_insns_in_loop (loop);
1303
1304 wont_exit = sbitmap_alloc (npeel + 1);
1305 sbitmap_zero (wont_exit);
1306
1307 opt_info_start_duplication (opt_info);
1308
1309 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1310 npeel, wont_exit, NULL,
1311 NULL, DLTHE_FLAG_UPDATE_FREQ
1312 | (opt_info
1313 ? DLTHE_RECORD_COPY_NUMBER
1314 : 0));
1315 gcc_assert (ok);
1316
1317 free (wont_exit);
1318
1319 if (opt_info)
1320 {
1321 apply_opt_in_copies (opt_info, npeel, false, false);
1322 free_opt_info (opt_info);
1323 }
1324
1325 if (desc->simple_p)
1326 {
1327 if (desc->const_iter)
1328 {
1329 desc->niter -= npeel;
1330 desc->niter_expr = GEN_INT (desc->niter);
1331 desc->noloop_assumptions = NULL_RTX;
1332 }
1333 else
1334 {
1335 /* We cannot just update niter_expr, as its value might be clobbered
1336 inside loop. We could handle this by counting the number into
1337 temporary just like we do in runtime unrolling, but it does not
1338 seem worthwhile. */
1339 free_simple_loop_desc (loop);
1340 }
1341 }
1342 if (dump_file)
1343 fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1344 }
1345
1346 /* Decide whether to unroll LOOP stupidly and how much. */
1347 static void
1348 decide_unroll_stupid (struct loop *loop, int flags)
1349 {
1350 unsigned nunroll, nunroll_by_av, i;
1351 struct niter_desc *desc;
1352
1353 if (!(flags & UAP_UNROLL_ALL))
1354 {
1355       /* We were not asked to, just return silently.  */
1356 return;
1357 }
1358
1359 if (dump_file)
1360 fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1361
1362   /* nunroll = total number of copies of the original loop body in
1363      the unrolled loop (i.e. if it is 2, we have to duplicate the loop body once).  */
1364 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1365 nunroll_by_av
1366 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1367 if (nunroll > nunroll_by_av)
1368 nunroll = nunroll_by_av;
1369 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1370 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1371
1372 if (targetm.loop_unroll_adjust)
1373 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1374
1375 /* Skip big loops. */
1376 if (nunroll <= 1)
1377 {
1378 if (dump_file)
1379 fprintf (dump_file, ";; Not considering loop, is too big\n");
1380 return;
1381 }
1382
1383 /* Check for simple loops. */
1384 desc = get_simple_loop_desc (loop);
1385
1386 /* Check simpleness. */
1387 if (desc->simple_p && !desc->assumptions)
1388 {
1389 if (dump_file)
1390 fprintf (dump_file, ";; The loop is simple\n");
1391 return;
1392 }
1393
1394   /* Do not unroll loops with branches inside -- it increases the number
1395      of mispredicts.  */
1396 if (num_loop_branches (loop) > 1)
1397 {
1398 if (dump_file)
1399 fprintf (dump_file, ";; Not unrolling, contains branches\n");
1400 return;
1401 }
1402
1403 /* If we have profile feedback, check whether the loop rolls. */
1404 if ((loop->header->count
1405 && expected_loop_iterations (loop) < 2 * nunroll)
1406 || desc->niter_max < 2 * nunroll)
1407 {
1408 if (dump_file)
1409 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1410 return;
1411 }
1412
1413   /* Success.  Now force nunroll to be a power of 2, as it seems that this
1414      improves results (partially because of better alignment, partially
1415 because of some dark magic). */
1416 for (i = 1; 2 * i <= nunroll; i *= 2)
1417 continue;
1418
1419 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1420 loop->lpt_decision.times = i - 1;
1421
1422 if (dump_file)
1423 fprintf (dump_file,
1424 ";; Decided to unroll the loop stupidly, %d times.\n",
1425 loop->lpt_decision.times);
1426 }
1427
1428 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation:
1429 while (cond)
1430 body;
1431
1432 ==>
1433
1434 while (cond)
1435 {
1436 body;
1437 if (!cond) break;
1438 body;
1439 if (!cond) break;
1440 body;
1441 if (!cond) break;
1442 body;
1443 }
1444 */
1445 static void
1446 unroll_loop_stupid (struct loop *loop)
1447 {
1448 sbitmap wont_exit;
1449 unsigned nunroll = loop->lpt_decision.times;
1450 struct niter_desc *desc = get_simple_loop_desc (loop);
1451 struct opt_info *opt_info = NULL;
1452 bool ok;
1453
1454 if (flag_split_ivs_in_unroller
1455 || flag_variable_expansion_in_unroller)
1456 opt_info = analyze_insns_in_loop (loop);
1457
1458
1459 wont_exit = sbitmap_alloc (nunroll + 1);
1460 sbitmap_zero (wont_exit);
1461 opt_info_start_duplication (opt_info);
1462
1463 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1464 nunroll, wont_exit,
1465 NULL, NULL,
1466 DLTHE_FLAG_UPDATE_FREQ
1467 | (opt_info
1468 ? DLTHE_RECORD_COPY_NUMBER
1469 : 0));
1470 gcc_assert (ok);
1471
1472 if (opt_info)
1473 {
1474 apply_opt_in_copies (opt_info, nunroll, true, true);
1475 free_opt_info (opt_info);
1476 }
1477
1478 free (wont_exit);
1479
1480 if (desc->simple_p)
1481 {
1482 /* We indeed may get here provided that there are nontrivial assumptions
1483 for a loop to be really simple. We could update the counts, but the
1484 problem is that we are unable to decide which exit will be taken
1485 (not really true in case the number of iterations is constant,
1486          but no one will do anything with this information, so we do not
1487 worry about it). */
1488 desc->simple_p = false;
1489 }
1490
1491 if (dump_file)
1492 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1493 nunroll, num_loop_insns (loop));
1494 }
1495
1496 /* A hash function for information about insns to split. */
1497
1498 static hashval_t
1499 si_info_hash (const void *ivts)
1500 {
1501 return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1502 }
1503
1504 /* An equality function for information about insns to split.  */
1505
1506 static int
1507 si_info_eq (const void *ivts1, const void *ivts2)
1508 {
1509 const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1510 const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1511
1512 return i1->insn == i2->insn;
1513 }
1514
1515 /* Return a hash for VES, which is really a "var_to_expand *". */
1516
1517 static hashval_t
1518 ve_info_hash (const void *ves)
1519 {
1520 return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1521 }
1522
1523 /* Return true if IVTS1 and IVTS2 (which are really both of type
1524 "var_to_expand *") refer to the same instruction. */
1525
1526 static int
1527 ve_info_eq (const void *ivts1, const void *ivts2)
1528 {
1529 const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1530 const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1531
1532 return i1->insn == i2->insn;
1533 }
1534
1535 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1536 Set *DEBUG_USES to the number of debug insns that reference the
1537 variable. */
1538
1539 bool
1540 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1541 int *debug_uses)
1542 {
1543 basic_block *body, bb;
1544 unsigned i;
1545 int count_ref = 0;
1546 rtx insn;
1547
1548 body = get_loop_body (loop);
1549 for (i = 0; i < loop->num_nodes; i++)
1550 {
1551 bb = body[i];
1552
1553 FOR_BB_INSNS (bb, insn)
1554 if (!rtx_referenced_p (reg, insn))
1555 continue;
1556 else if (DEBUG_INSN_P (insn))
1557 ++*debug_uses;
1558 else if (++count_ref > 1)
1559 break;
1560 }
1561 free (body);
1562 return (count_ref == 1);
1563 }
1564
1565 /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1566
1567 static void
1568 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1569 {
1570 basic_block *body, bb;
1571 unsigned i;
1572 rtx insn;
1573
1574 body = get_loop_body (loop);
1575 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1576 {
1577 bb = body[i];
1578
1579 FOR_BB_INSNS (bb, insn)
1580 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1581 continue;
1582 else
1583 {
1584 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1585 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1586 if (!--debug_uses)
1587 break;
1588 }
1589 }
1590 free (body);
1591 }
1592
1593 /* Determine whether INSN contains an accumulator
1594 which can be expanded into separate copies,
1595 one for each copy of the LOOP body.
1596
1597 for (i = 0 ; i < n; i++)
1598 sum += a[i];
1599
1600 ==>
1601
1602 sum += a[i]
1603 ....
1604 i = i+1;
1605 sum1 += a[i]
1606 ....
1607 i = i+1
1608 sum2 += a[i];
1609 ....
1610
1611    Return NULL if INSN contains no opportunity for expansion of an accumulator.
1612 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1613 information and return a pointer to it.
1614 */
1615
1616 static struct var_to_expand *
1617 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1618 {
1619 rtx set, dest, src;
1620 struct var_to_expand *ves;
1621 unsigned accum_pos;
1622 enum rtx_code code;
1623 int debug_uses = 0;
1624
1625 set = single_set (insn);
1626 if (!set)
1627 return NULL;
1628
1629 dest = SET_DEST (set);
1630 src = SET_SRC (set);
1631 code = GET_CODE (src);
1632
1633 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1634 return NULL;
1635
1636 if (FLOAT_MODE_P (GET_MODE (dest)))
1637 {
1638 if (!flag_associative_math)
1639 return NULL;
1640 /* In the case of FMA, we're also changing the rounding. */
1641 if (code == FMA && !flag_unsafe_math_optimizations)
1642 return NULL;
1643 }
1644
1645 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
1646      in MD.  But if there is no optab to generate the insn, we cannot
1647 perform the variable expansion. This can happen if an MD provides
1648 an insn but not a named pattern to generate it, for example to avoid
1649 producing code that needs additional mode switches like for x87/mmx.
1650
1651 So we check have_insn_for which looks for an optab for the operation
1652 in SRC. If it doesn't exist, we can't perform the expansion even
1653 though INSN is valid. */
1654 if (!have_insn_for (code, GET_MODE (src)))
1655 return NULL;
1656
1657 if (!REG_P (dest)
1658 && !(GET_CODE (dest) == SUBREG
1659 && REG_P (SUBREG_REG (dest))))
1660 return NULL;
1661
1662 /* Find the accumulator use within the operation. */
1663 if (code == FMA)
1664 {
1665 /* We only support accumulation via FMA in the ADD position. */
1666 if (!rtx_equal_p (dest, XEXP (src, 2)))
1667 return NULL;
1668 accum_pos = 2;
1669 }
1670 else if (rtx_equal_p (dest, XEXP (src, 0)))
1671 accum_pos = 0;
1672 else if (rtx_equal_p (dest, XEXP (src, 1)))
1673 {
1674       /* The method of expansion that we are using, which includes
1675          initializing the expansions with zero and summing the expansions
1676          at the end of the computation, would yield wrong results for
1677          (x = something - x); thus avoid using it in that case.  */
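      /* A tiny made-up example of the problem: for x = a[i] - x with
         a[] == {5, 3} and x starting at 0, the loop computes x = 5 - 0 = 5
         and then x = 3 - 5 = -2.  With a zero-initialized expansion x1
         taking the second copy we would instead get x == 5 and x1 == 3,
         and x + x1 == 8, which is wrong.  */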
1678 if (code == MINUS)
1679 return NULL;
1680 accum_pos = 1;
1681 }
1682 else
1683 return NULL;
1684
1685 /* It must not otherwise be used. */
1686 if (code == FMA)
1687 {
1688 if (rtx_referenced_p (dest, XEXP (src, 0))
1689 || rtx_referenced_p (dest, XEXP (src, 1)))
1690 return NULL;
1691 }
1692 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1693 return NULL;
1694
1695 /* It must be used in exactly one insn. */
1696 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1697 return NULL;
1698
1699 if (dump_file)
1700 {
1701 fprintf (dump_file, "\n;; Expanding Accumulator ");
1702 print_rtl (dump_file, dest);
1703 fprintf (dump_file, "\n");
1704 }
1705
1706 if (debug_uses)
1707 /* Instead of resetting the debug insns, we could replace each
1708 debug use in the loop with the sum or product of all expanded
1709        accumulators.  Since we'll only know of all expansions at the
1710 end, we'd have to keep track of which vars_to_expand a debug
1711 insn in the loop references, take note of each copy of the
1712 debug insn during unrolling, and when it's all done, compute
1713 the sum or product of each variable and adjust the original
1714 debug insn and each copy thereof. What a pain! */
1715 reset_debug_uses_in_loop (loop, dest, debug_uses);
1716
1717 /* Record the accumulator to expand. */
1718 ves = XNEW (struct var_to_expand);
1719 ves->insn = insn;
1720 ves->reg = copy_rtx (dest);
1721 ves->var_expansions = VEC_alloc (rtx, heap, 1);
1722 ves->next = NULL;
1723 ves->op = GET_CODE (src);
1724 ves->expansion_count = 0;
1725 ves->reuse_expansion = 0;
1726 ves->accum_pos = accum_pos;
1727 return ves;
1728 }
1729
1730 /* Determine whether there is an induction variable in INSN that
1731 we would like to split during unrolling.
1732
1733 I.e. replace
1734
1735 i = i + 1;
1736 ...
1737 i = i + 1;
1738 ...
1739 i = i + 1;
1740 ...
1741
1742 type chains by
1743
1744 i0 = i + 1
1745 ...
1746 i = i0 + 1
1747 ...
1748 i = i0 + 2
1749 ...
1750
1751 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1752 an IV_TO_SPLIT structure, fill it with the relevant information and return a
1753 pointer to it. */
1754
1755 static struct iv_to_split *
1756 analyze_iv_to_split_insn (rtx insn)
1757 {
1758 rtx set, dest;
1759 struct rtx_iv iv;
1760 struct iv_to_split *ivts;
1761 bool ok;
1762
1763 /* For now we just split the basic induction variables. Later this may be
1764      extended, for example, to also handle addresses of memory references.  */
1765 set = single_set (insn);
1766 if (!set)
1767 return NULL;
1768
1769 dest = SET_DEST (set);
1770 if (!REG_P (dest))
1771 return NULL;
1772
1773 if (!biv_p (insn, dest))
1774 return NULL;
1775
1776 ok = iv_analyze_result (insn, dest, &iv);
1777
1778 /* This used to be an assert under the assumption that if biv_p returns
1779      true then iv_analyze_result must also return true.  However, that
1780 assumption is not strictly correct as evidenced by pr25569.
1781
1782 Returning NULL when iv_analyze_result returns false is safe and
1783 avoids the problems in pr25569 until the iv_analyze_* routines
1784 can be fixed, which is apparently hard and time consuming
1785 according to their author. */
1786 if (! ok)
1787 return NULL;
1788
1789 if (iv.step == const0_rtx
1790 || iv.mode != iv.extend_mode)
1791 return NULL;
1792
1793 /* Record the insn to split. */
1794 ivts = XNEW (struct iv_to_split);
1795 ivts->insn = insn;
1796 ivts->base_var = NULL_RTX;
1797 ivts->step = iv.step;
1798 ivts->next = NULL;
1799 ivts->n_loc = 1;
1800 ivts->loc[0] = 1;
1801
1802 return ivts;
1803 }
1804
1805 /* Determine which of the insns in LOOP can be optimized.
1806    Return an OPT_INFO struct with the relevant hash tables filled
1807 with all insns to be optimized. The FIRST_NEW_BLOCK field
1808 is undefined for the return value. */
1809
1810 static struct opt_info *
1811 analyze_insns_in_loop (struct loop *loop)
1812 {
1813 basic_block *body, bb;
1814 unsigned i;
1815 struct opt_info *opt_info = XCNEW (struct opt_info);
1816 rtx insn;
1817 struct iv_to_split *ivts = NULL;
1818 struct var_to_expand *ves = NULL;
1819 PTR *slot1;
1820 PTR *slot2;
1821 VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1822 edge exit;
1823 bool can_apply = false;
1824
1825 iv_analysis_loop_init (loop);
1826
1827 body = get_loop_body (loop);
1828
1829 if (flag_split_ivs_in_unroller)
1830 {
1831 opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1832 si_info_hash, si_info_eq, free);
1833 opt_info->iv_to_split_head = NULL;
1834 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1835 }
1836
1837 /* Record the loop exit bb and loop preheader before the unrolling. */
1838 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1839
1840 if (VEC_length (edge, edges) == 1)
1841 {
1842 exit = VEC_index (edge, edges, 0);
1843 if (!(exit->flags & EDGE_COMPLEX))
1844 {
1845 opt_info->loop_exit = split_edge (exit);
1846 can_apply = true;
1847 }
1848 }
1849
1850 if (flag_variable_expansion_in_unroller
1851 && can_apply)
1852 {
1853 opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1854 ve_info_hash,
1855 ve_info_eq, free);
1856 opt_info->var_to_expand_head = NULL;
1857 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1858 }
1859
1860 for (i = 0; i < loop->num_nodes; i++)
1861 {
1862 bb = body[i];
1863 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1864 continue;
1865
1866 FOR_BB_INSNS (bb, insn)
1867 {
1868 if (!INSN_P (insn))
1869 continue;
1870
1871 if (opt_info->insns_to_split)
1872 ivts = analyze_iv_to_split_insn (insn);
1873
1874 if (ivts)
1875 {
1876 slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1877 gcc_assert (*slot1 == NULL);
1878 *slot1 = ivts;
1879 *opt_info->iv_to_split_tail = ivts;
1880 opt_info->iv_to_split_tail = &ivts->next;
1881 continue;
1882 }
1883
1884 if (opt_info->insns_with_var_to_expand)
1885 ves = analyze_insn_to_expand_var (loop, insn);
1886
1887 if (ves)
1888 {
1889 slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1890 gcc_assert (*slot2 == NULL);
1891 *slot2 = ves;
1892 *opt_info->var_to_expand_tail = ves;
1893 opt_info->var_to_expand_tail = &ves->next;
1894 }
1895 }
1896 }
1897
1898 VEC_free (edge, heap, edges);
1899 free (body);
1900 return opt_info;
1901 }
1902
1903 /* Called just before loop duplication. Records start of duplicated area
1904 to OPT_INFO. */
1905
1906 static void
1907 opt_info_start_duplication (struct opt_info *opt_info)
1908 {
1909 if (opt_info)
1910 opt_info->first_new_block = last_basic_block;
1911 }
1912
1913 /* Determine the number of iterations between initialization of the base
1914 variable and the current copy (N_COPY). N_COPIES is the total number
1915 of newly created copies. UNROLLING is true if we are unrolling
1916 (not peeling) the loop. */
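
/* For example (a hypothetical run that creates three new copies): when
   unrolling, the base variable is initialized in the original body (copy 0),
   so copies 0, 1, 2, 3 get deltas 0, 1, 2, 3.  When peeling, the
   initialization lands in copy 1, so copies 1, 2, 3 get deltas 0, 1, 2 and
   the original body (n_copy == 0) gets delta N_COPIES == 3.  */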
1917
1918 static unsigned
1919 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1920 {
1921 if (unrolling)
1922 {
1923 /* If we are unrolling, initialization is done in the original loop
1924 body (number 0). */
1925 return n_copy;
1926 }
1927 else
1928 {
1929 /* If we are peeling, the copy in which the initialization occurs has
1930 number 1.  The original loop (number 0) is the last. */
1931 if (n_copy)
1932 return n_copy - 1;
1933 else
1934 return n_copies;
1935 }
1936 }
1937
1938 /* Locate in EXPR the expression corresponding to the location recorded
1939 in IVTS, and return a pointer to the RTX for this location. */
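/* For example (illustrative): with N_LOC == 1 and LOC[0] == 1, the values
   recorded by analyze_iv_to_split_insn above, this descends one level into
   EXPR and returns &XEXP (EXPR, 1), i.e. the address of the SET_SRC when
   EXPR is the single_set of the insn.  */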
1940
1941 static rtx *
1942 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1943 {
1944 unsigned i;
1945 rtx *ret = &expr;
1946
1947 for (i = 0; i < ivts->n_loc; i++)
1948 ret = &XEXP (*ret, ivts->loc[i]);
1949
1950 return ret;
1951 }
1952
1953 /* Allocate the basic variable for the induction variable chain described by IVTS. */
1954
1955 static void
1956 allocate_basic_variable (struct iv_to_split *ivts)
1957 {
1958 rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1959
1960 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1961 }
1962
1963 /* Insert initialization of basic variable of IVTS before INSN, taking
1964 the initial value from INSN. */
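/* A sketch of the effect (hypothetical insn): if the copy receiving the
   initialization contains the insn "i = i + 4", the sequence emitted before
   it sets the basic variable from that same source, effectively
   "i0 = i + 4", so that later copies can be expressed as i0 plus a
   multiple of the step.  */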
1965
1966 static void
1967 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
1968 {
1969 rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
1970 rtx seq;
1971
1972 start_sequence ();
1973 expr = force_operand (expr, ivts->base_var);
1974 if (expr != ivts->base_var)
1975 emit_move_insn (ivts->base_var, expr);
1976 seq = get_insns ();
1977 end_sequence ();
1978
1979 emit_insn_before (seq, insn);
1980 }
1981
1982 /* Replace the use of the induction variable described in IVTS in INSN
1983 by base variable + DELTA * step. */
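/* A sketch with hypothetical values: if the induction variable has step 4
   and this copy has DELTA == 2, a use of the variable in INSN is replaced
   by the expression base_var + 8, with base_var holding the value the
   variable had in the copy where it was initialized.  */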
1984
1985 static void
1986 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
1987 {
1988 rtx expr, *loc, seq, incr, var;
1989 enum machine_mode mode = GET_MODE (ivts->base_var);
1990 rtx src, dest, set;
1991
1992 /* Construct base + DELTA * step. */
1993 if (!delta)
1994 expr = ivts->base_var;
1995 else
1996 {
1997 incr = simplify_gen_binary (MULT, mode,
1998 ivts->step, gen_int_mode (delta, mode));
1999 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2000 ivts->base_var, incr);
2001 }
2002
2003 /* Figure out where to do the replacement. */
2004 loc = get_ivts_expr (single_set (insn), ivts);
2005
2006 /* If we can make the replacement right away, we're done. */
2007 if (validate_change (insn, loc, expr, 0))
2008 return;
2009
2010 /* Otherwise, force EXPR into a register and try again. */
2011 start_sequence ();
2012 var = gen_reg_rtx (mode);
2013 expr = force_operand (expr, var);
2014 if (expr != var)
2015 emit_move_insn (var, expr);
2016 seq = get_insns ();
2017 end_sequence ();
2018 emit_insn_before (seq, insn);
2019
2020 if (validate_change (insn, loc, var, 0))
2021 return;
2022
2023 /* The last chance.  Try recreating the assignment in INSN
2024 completely from scratch. */
2025 set = single_set (insn);
2026 gcc_assert (set);
2027
2028 start_sequence ();
2029 *loc = var;
2030 src = copy_rtx (SET_SRC (set));
2031 dest = copy_rtx (SET_DEST (set));
2032 src = force_operand (src, dest);
2033 if (src != dest)
2034 emit_move_insn (dest, src);
2035 seq = get_insns ();
2036 end_sequence ();
2037
2038 emit_insn_before (seq, insn);
2039 delete_insn (insn);
2040 }
2041
2042
2043 /* Return one expansion of the accumulator recorded in struct VE. */
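/* Informally: expansions are handed out round-robin.  REUSE_EXPANSION == 0
   yields the original accumulator register, REUSE_EXPANSION == N yields the
   (N-1)-th element of VAR_EXPANSIONS, and the counter wraps back to 0 once
   the last recorded expansion has been handed out.  */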
2044
2045 static rtx
2046 get_expansion (struct var_to_expand *ve)
2047 {
2048 rtx reg;
2049
2050 if (ve->reuse_expansion == 0)
2051 reg = ve->reg;
2052 else
2053 reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2054
2055 if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2056 ve->reuse_expansion = 0;
2057 else
2058 ve->reuse_expansion++;
2059
2060 return reg;
2061 }
2062
2063
2064 /* Given INSN, replace the uses of the accumulator recorded in VE
2065 with a new register. */
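/* For instance (sum and x are illustrative names), an accumulation

     sum = sum + x

   in one of the copies becomes

     sum1 = sum1 + x

   where sum1 is either a freshly generated register or a reused expansion,
   so the copies no longer all update the same accumulator.  */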
2066
2067 static void
2068 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2069 {
2070 rtx new_reg, set;
2071 bool really_new_expansion = false;
2072
2073 set = single_set (insn);
2074 gcc_assert (set);
2075
2076 /* Generate a new register only if the expansion limit has not been
2077 reached. Else reuse an already existing expansion. */
2078 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2079 {
2080 really_new_expansion = true;
2081 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2082 }
2083 else
2084 new_reg = get_expansion (ve);
2085
2086 validate_change (insn, &SET_DEST (set), new_reg, 1);
2087 validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2088
2089 if (apply_change_group ())
2090 if (really_new_expansion)
2091 {
2092 VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2093 ve->expansion_count++;
2094 }
2095 }
2096
2097 /* Initialize the variable expansions in the loop preheader.  PLACE is the
2098 loop-preheader basic block where the initialization of the
2099 expansions should take place.  The expansions are initialized with
2100 (-0) when the operation is plus or minus, in order to honor signed zero.
2101 This way we prevent cases where the sign of the final result is
2102 affected by the sign of the expansion.  Here is an example to
2103 demonstrate this:
2104
2105 for (i = 0 ; i < n; i++)
2106 sum += something;
2107
2108 ==>
2109
2110 sum += something;
2111 ....
2112 i = i + 1;
2113 sum1 += something;
2114 ....
2115 i = i + 1;
2116 sum2 += something;
2117 ....
2118
2119 When SUM is initialized with -zero and SOMETHING is also -zero, the
2120 final result of SUM should be -zero, thus the expansions sum1 and sum2
2121 should be initialized with -zero as well (otherwise we would get +zero
2122 as the final result). */
2123
2124 static void
2125 insert_var_expansion_initialization (struct var_to_expand *ve,
2126 basic_block place)
2127 {
2128 rtx seq, var, zero_init, insn;
2129 unsigned i;
2130 enum machine_mode mode = GET_MODE (ve->reg);
2131 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2132
2133 if (VEC_length (rtx, ve->var_expansions) == 0)
2134 return;
2135
2136 start_sequence ();
2137 switch (ve->op)
2138 {
2139 case FMA:
2140 /* Note that we only accumulate FMA via the ADD operand. */
2141 case PLUS:
2142 case MINUS:
2143 FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2144 {
2145 if (honor_signed_zero_p)
2146 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2147 else
2148 zero_init = CONST0_RTX (mode);
2149 emit_move_insn (var, zero_init);
2150 }
2151 break;
2152
2153 case MULT:
2154 FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2155 {
2156 zero_init = CONST1_RTX (GET_MODE (var));
2157 emit_move_insn (var, zero_init);
2158 }
2159 break;
2160
2161 default:
2162 gcc_unreachable ();
2163 }
2164
2165 seq = get_insns ();
2166 end_sequence ();
2167
2168 insn = BB_HEAD (place);
2169 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2170 insn = NEXT_INSN (insn);
2171
2172 emit_insn_after (seq, insn);
2173 }
2174
2175 /* Combine the variable expansions at the loop exit. PLACE is the
2176 loop exit basic block where the summation of the expansions should
2177 take place. */
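/* Continuing the illustration above: for a PLUS (or MINUS, or FMA)
   accumulator with expansions sum1 and sum2, the emitted sequence computes

     sum = sum2 + (sum1 + sum)

   while for MULT the partial products are multiplied together instead.  */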
2178
2179 static void
2180 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2181 {
2182 rtx sum = ve->reg;
2183 rtx expr, seq, var, insn;
2184 unsigned i;
2185
2186 if (VEC_length (rtx, ve->var_expansions) == 0)
2187 return;
2188
2189 start_sequence ();
2190 switch (ve->op)
2191 {
2192 case FMA:
2193 /* Note that we only accumulate FMA via the ADD operand. */
2194 case PLUS:
2195 case MINUS:
2196 FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2197 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2198 break;
2199
2200 case MULT:
2201 FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2202 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2203 break;
2204
2205 default:
2206 gcc_unreachable ();
2207 }
2208
2209 expr = force_operand (sum, ve->reg);
2210 if (expr != ve->reg)
2211 emit_move_insn (ve->reg, expr);
2212 seq = get_insns ();
2213 end_sequence ();
2214
2215 insn = BB_HEAD (place);
2216 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2217 insn = NEXT_INSN (insn);
2218
2219 emit_insn_after (seq, insn);
2220 }
2221
2222 /* Apply loop optimizations in loop copies using the
2223 data gathered during the unrolling; structure
2224 OPT_INFO records that data.
2225
2226 UNROLLING is true if we unrolled (not peeled) the loop.
2227 REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
2228 the loop (as should happen in complete unrolling, but not in ordinary
2229 peeling of the loop). */
2230
2231 static void
2232 apply_opt_in_copies (struct opt_info *opt_info,
2233 unsigned n_copies, bool unrolling,
2234 bool rewrite_original_loop)
2235 {
2236 unsigned i, delta;
2237 basic_block bb, orig_bb;
2238 rtx insn, orig_insn, next;
2239 struct iv_to_split ivts_templ, *ivts;
2240 struct var_to_expand ve_templ, *ves;
2241
2242 /* Sanity check -- we need to put initialization in the original loop
2243 body. */
2244 gcc_assert (!unrolling || rewrite_original_loop);
2245
2246 /* Allocate the basic variables (i0). */
2247 if (opt_info->insns_to_split)
2248 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2249 allocate_basic_variable (ivts);
2250
2251 for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2252 {
2253 bb = BASIC_BLOCK (i);
2254 orig_bb = get_bb_original (bb);
2255
2256 /* bb->aux holds the position in the copy sequence initialized by
2257 duplicate_loop_to_header_edge. */
2258 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2259 unrolling);
2260 bb->aux = 0;
2261 orig_insn = BB_HEAD (orig_bb);
2262 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2263 {
2264 next = NEXT_INSN (insn);
2265 if (!INSN_P (insn)
2266 || (DEBUG_INSN_P (insn)
2267 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2268 continue;
2269
2270 while (!INSN_P (orig_insn)
2271 || (DEBUG_INSN_P (orig_insn)
2272 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2273 == LABEL_DECL)))
2274 orig_insn = NEXT_INSN (orig_insn);
2275
2276 ivts_templ.insn = orig_insn;
2277 ve_templ.insn = orig_insn;
2278
2279 /* Apply splitting iv optimization. */
2280 if (opt_info->insns_to_split)
2281 {
2282 ivts = (struct iv_to_split *)
2283 htab_find (opt_info->insns_to_split, &ivts_templ);
2284
2285 if (ivts)
2286 {
2287 gcc_assert (GET_CODE (PATTERN (insn))
2288 == GET_CODE (PATTERN (orig_insn)));
2289
2290 if (!delta)
2291 insert_base_initialization (ivts, insn);
2292 split_iv (ivts, insn, delta);
2293 }
2294 }
2295 /* Apply variable expansion optimization. */
2296 if (unrolling && opt_info->insns_with_var_to_expand)
2297 {
2298 ves = (struct var_to_expand *)
2299 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2300 if (ves)
2301 {
2302 gcc_assert (GET_CODE (PATTERN (insn))
2303 == GET_CODE (PATTERN (orig_insn)));
2304 expand_var_during_unrolling (ves, insn);
2305 }
2306 }
2307 orig_insn = NEXT_INSN (orig_insn);
2308 }
2309 }
2310
2311 if (!rewrite_original_loop)
2312 return;
2313
2314 /* Initialize the variable expansions in the loop preheader
2315 and take care of combining them at the loop exit. */
2316 if (opt_info->insns_with_var_to_expand)
2317 {
2318 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2319 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2320 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2321 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2322 }
2323
2324 /* Also rewrite the original loop body.  Find its blocks as the originals of
2325 the blocks in the last copied iteration, i.e. those that have
2326 get_bb_copy (get_bb_original (bb)) == bb. */
2327 for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2328 {
2329 bb = BASIC_BLOCK (i);
2330 orig_bb = get_bb_original (bb);
2331 if (get_bb_copy (orig_bb) != bb)
2332 continue;
2333
2334 delta = determine_split_iv_delta (0, n_copies, unrolling);
2335 for (orig_insn = BB_HEAD (orig_bb);
2336 orig_insn != NEXT_INSN (BB_END (bb));
2337 orig_insn = next)
2338 {
2339 next = NEXT_INSN (orig_insn);
2340
2341 if (!INSN_P (orig_insn))
2342 continue;
2343
2344 ivts_templ.insn = orig_insn;
2345 if (opt_info->insns_to_split)
2346 {
2347 ivts = (struct iv_to_split *)
2348 htab_find (opt_info->insns_to_split, &ivts_templ);
2349 if (ivts)
2350 {
2351 if (!delta)
2352 insert_base_initialization (ivts, orig_insn);
2353 split_iv (ivts, orig_insn, delta);
2354 continue;
2355 }
2356 }
2357
2358 }
2359 }
2360 }
2361
2362 /* Release OPT_INFO. */
2363
2364 static void
2365 free_opt_info (struct opt_info *opt_info)
2366 {
2367 if (opt_info->insns_to_split)
2368 htab_delete (opt_info->insns_to_split);
2369 if (opt_info->insns_with_var_to_expand)
2370 {
2371 struct var_to_expand *ves;
2372
2373 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2374 VEC_free (rtx, heap, ves->var_expansions);
2375 htab_delete (opt_info->insns_with_var_to_expand);
2376 }
2377 free (opt_info);
2378 }