gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "cfglayout.h"
  31 #include "params.h"
  32 #include "output.h"
  33 #include "expr.h"
  34 #include "hashtab.h"
  35 #include "recog.h"
  36 #include "target.h"
  37
  38 /* This pass performs loop unrolling and peeling.  We only perform these
  39    optimizations on innermost loops (with a single exception) because
  40    the impact on performance is greatest there, and we want to avoid
  41    unnecessary code size growth.  The gain comes from more sequential
  42    code, from giving later passes better code to optimize and, in some
  43    cases, from fewer tests of exit conditions.  The main problem is code
  44    growth, which hurts performance because of cache effects.
  45
  46    What we do:
  47
  48    -- complete peeling of once-rolling loops; this is the above mentioned
  49       exception, as this causes loop to be cancelled completely and
  50       does not cause code growth
  51    -- complete peeling of loops that roll (small) constant times.
  52    -- simple peeling of first iterations of loops that do not roll much
  53       (according to profile feedback)
  54    -- unrolling of loops that roll constant times; this is almost always
  55       win, as we get rid of exit condition tests.
  56    -- unrolling of loops that roll number of times that we can compute
  57       in runtime; we also get rid of exit condition tests here, but there
  58       is the extra expense for calculating the number of iterations
  59    -- simple unrolling of remaining loops; this is performed only if we
  60       are asked to, as the gain is questionable in this case and often
  61       it may even slow down the code
  62    For a more detailed description of each of these, see the comment at
  63    the appropriate function below.
  64
  65    There are many parameters (defined and described in params.def) that
  66    control how much we unroll/peel.
  67
  68    ??? A big problem is that we don't have a good way to determine
  69    how many times we should unroll the loop; the experiments I have made
  70    showed that this choice may affect performance by several percent.
  71    */
  72
  73 /* Information about induction variables to split.  Each record describes
        one insn; records are chained through NEXT.  */
  74
  75 struct iv_to_split
  76 {
  77   rtx insn;             /* The insn in which the induction variable occurs.  */
  78   rtx base_var;         /* The variable on which the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  N_LOC (above) says
                                how many entries of LOC are used.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about accumulators to expand.  Each record describes one
        insn; records are chained through NEXT.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in which the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator that is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* Count of the expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0, reuse
 102                                       the original accumulator.  Otherwise use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104   unsigned accum_pos;              /* The position at which the accumulator is placed in
 105                                       the insn src.  For example in x = x + something
 106                                       accum_pos is 0 while in x = something + x accum_pos
 107                                       is 1.  */
 108 };
 109
 110 /* Information about the optimizations applied in
 111    the unrolled loop.  It groups the induction variables to split and the
        accumulators to expand (each kept both in a hash table and in a linked
        list with head and tail pointers) together with a few basic blocks
        of the loop.  */
 112
 113 struct opt_info
 114 {
 115   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 116   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 117   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 118   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 119                                       to expand.  */
 120   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 121   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 122   unsigned first_new_block;        /* The first basic block that was
 123                                       duplicated.  */
 124   basic_block loop_exit;           /* The loop exit basic block.  */
 125   basic_block loop_preheader;      /* The loop preheader basic block.  */
 126 };
 127 /* Prototypes of the file-local (static) functions of this pass.  */
 128 static void decide_unrolling_and_peeling (int);
 129 static void peel_loops_completely (int);
 130 static void decide_peel_simple (struct loop *, int);
 131 static void decide_peel_once_rolling (struct loop *, int);
 132 static void decide_peel_completely (struct loop *, int);
 133 static void decide_unroll_stupid (struct loop *, int);
 134 static void decide_unroll_constant_iterations (struct loop *, int);
 135 static void decide_unroll_runtime_iterations (struct loop *, int);
 136 static void peel_loop_simple (struct loop *);
 137 static void peel_loop_completely (struct loop *);
 138 static void unroll_loop_stupid (struct loop *);
 139 static void unroll_loop_constant_iterations (struct loop *);
 140 static void unroll_loop_runtime_iterations (struct loop *);
 141 static struct opt_info *analyze_insns_in_loop (struct loop *);
 142 static void opt_info_start_duplication (struct opt_info *);
 143 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 144 static void free_opt_info (struct opt_info *);
 145 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 146 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 147 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 148 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 149 static void insert_var_expansion_initialization (struct var_to_expand *,
 150                                                  basic_block);
 151 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 152                                              basic_block);
 153 static rtx get_expansion (struct var_to_expand *);
 154
 154
 155 /* Unroll and/or peel loops, depending on FLAGS.  There is no explicit
        loop argument; the loops are visited with FOR_EACH_LOOP.

        The work is done in three steps: complete peeling is performed first,
        then a decision is recorded in loop->lpt_decision for the remaining
        loops, and finally each loop is handed to the transformation that
        matches its decision.  FLAGS is passed on unchanged to the two
        decision steps.  */
 156 void
 157 unroll_and_peel_loops (int flags)
 158 {
 159   struct loop *loop;
 160   bool check;
 161   loop_iterator li;
 162
 163   /* First perform complete loop peeling (it is almost surely a win,
 164      and affects parameters for further decision a lot).  */
 165   peel_loops_completely (flags);
 166
 167   /* Now decide rest of unrolling and peeling.  */
 168   decide_unrolling_and_peeling (flags);
 169
 170   /* Scan the loops, inner ones first.  */
 171   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 172     {
           /* CHECK stays true unless the decision is LPT_NONE, so the
              verification below runs only for loops that were transformed.  */
 173       check = true;
 174       /* And perform the appropriate transformations.  */
 175       switch (loop->lpt_decision.decision)
 176         {
 177         case LPT_PEEL_COMPLETELY:
 178           /* Already done by peel_loops_completely above; the decision is
                  reset afterwards, so this case is not expected here.  */
 179           gcc_unreachable ();
 180         case LPT_PEEL_SIMPLE:
 181           peel_loop_simple (loop);
 182           break;
 183         case LPT_UNROLL_CONSTANT:
 184           unroll_loop_constant_iterations (loop);
 185           break;
 186         case LPT_UNROLL_RUNTIME:
 187           unroll_loop_runtime_iterations (loop);
 188           break;
 189         case LPT_UNROLL_STUPID:
 190           unroll_loop_stupid (loop);
 191           break;
 192         case LPT_NONE:
 193           check = false;
 194           break;
 195         default:
 196           gcc_unreachable ();
 197         }
 198       if (check)
 199         {
               /* The consistency checks are compiled in only when
                  ENABLE_CHECKING is defined.  */
 200 #ifdef ENABLE_CHECKING
 201           verify_dominators (CDI_DOMINATORS);
 202           verify_loop_structure ();
 203 #endif
 204         }
 205     }
 206
 207   iv_analysis_done ();
 208 }
 209
 210 /* Check whether the exit of the LOOP is at the end of the loop body.
        Returns true when the in_edge of LOOP's iteration description leads
        directly to the latch and the latch block contains no insn for which
        INSN_P holds; returns false otherwise.  */
 211
 212 static bool
 213 loop_exit_at_end_p (struct loop *loop)
 214 {
 215   struct niter_desc *desc = get_simple_loop_desc (loop);
 216   rtx insn;
 217
       /* The edge that stays in the loop must enter the latch.  */
 218   if (desc->in_edge->dest != loop->latch)
 219     return false;
 220
 221   /* Check that the latch is empty.  */
 222   FOR_BB_INSNS (loop->latch, insn)
 223     {
 224       if (INSN_P (insn))
 225         return false;
 226     }
 227
 228   return true;
 229 }
 230
 231 /* Depending on FLAGS, check whether to peel loops completely and do so.
        Every loop first gets its decision reset to LPT_NONE and its insn
        count refreshed; the "once rolling" test is tried before the general
        complete-peeling test.  A loop for which either test records
        LPT_PEEL_COMPLETELY is peeled immediately.  */
 232 static void
 233 peel_loops_completely (int flags)
 234 {
 235   struct loop *loop;
 236   loop_iterator li;
 237
 238   /* Scan the loops, the inner ones first.  */
 239   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 240     {
 241       loop->lpt_decision.decision = LPT_NONE;
 242
 243       if (dump_file)
 244         fprintf (dump_file,
 245                  "\n;; *** Considering loop %d for complete peeling ***\n",
 246                  loop->num);
 247
           /* Both decision routines below compare against loop->ninsns.  */
 248       loop->ninsns = num_loop_insns (loop);
 249
 250       decide_peel_once_rolling (loop, flags);
 251       if (loop->lpt_decision.decision == LPT_NONE)
 252         decide_peel_completely (loop, flags);
 253
 254       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 255         {
 256           peel_loop_completely (loop);
               /* The consistency checks are compiled in only when
                  ENABLE_CHECKING is defined.  */
 257 #ifdef ENABLE_CHECKING
 258           verify_dominators (CDI_DOMINATORS);
 259           verify_loop_structure ();
 260 #endif
 261         }
 262     }
 263 }
 264
 265 /* Decide whether to unroll or peel loops (depending on FLAGS) and by how
        much.  The result is stored in loop->lpt_decision of each loop.  Loops
        that are optimized for size, cannot be duplicated or are not innermost
        keep the decision LPT_NONE.  For the others the candidate
        transformations are tried in a fixed order, and the first one that
        records a decision wins.  */
 266 static void
 267 decide_unrolling_and_peeling (int flags)
 268 {
 269   struct loop *loop;
 270   loop_iterator li;
 271
 272   /* Scan the loops, inner ones first.  */
 273   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 274     {
 275       loop->lpt_decision.decision = LPT_NONE;
 276
 277       if (dump_file)
 278         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 279
 280       /* Do not peel cold areas.  */
 281       if (optimize_loop_for_size_p (loop))
 282         {
 283           if (dump_file)
 284             fprintf (dump_file, ";; Not considering loop, cold area\n");
 285           continue;
 286         }
 287
 288       /* Can the loop be manipulated?  */
 289       if (!can_duplicate_loop_p (loop))
 290         {
 291           if (dump_file)
 292             fprintf (dump_file,
 293                      ";; Not considering loop, cannot duplicate\n");
 294           continue;
 295         }
 296
 297       /* Skip non-innermost loops.  */
 298       if (loop->inner)
 299         {
 300           if (dump_file)
 301             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 302           continue;
 303         }
 304
           /* Refresh the size figures; the unrolling decisions below divide
              by both of them.  */
 305       loop->ninsns = num_loop_insns (loop);
 306       loop->av_ninsns = average_num_loop_insns (loop);
 307
 308       /* Try transformations one by one in decreasing order of
 309          priority.  Each later attempt runs only if the decision is still
              LPT_NONE.  */
 310
 311       decide_unroll_constant_iterations (loop, flags);
 312       if (loop->lpt_decision.decision == LPT_NONE)
 313         decide_unroll_runtime_iterations (loop, flags);
 314       if (loop->lpt_decision.decision == LPT_NONE)
 315         decide_unroll_stupid (loop, flags);
 316       if (loop->lpt_decision.decision == LPT_NONE)
 317         decide_peel_simple (loop, flags);
 318     }
 319 }
 320
 321 /* Decide whether the LOOP is once rolling and suitable for complete
 322    peeling.  On success loop->lpt_decision.decision is set to
        LPT_PEEL_COMPLETELY; otherwise it is left untouched.  FLAGS is
        unused.  The caller is expected to have set loop->ninsns.  */
 323 static void
 324 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 325 {
 326   struct niter_desc *desc;
 327
 328   if (dump_file)
 329     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 330
 331   /* Is the loop small enough?  */
 332   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 333     {
 334       if (dump_file)
 335         fprintf (dump_file, ";; Not considering loop, is too big\n");
 336       return;
 337     }
 338
 339   /* Check for simple loops.  */
 340   desc = get_simple_loop_desc (loop);
 341
 342   /* Check number of iterations.  The loop qualifies only if it is simple,
          has no outstanding assumptions, is not possibly infinite, and has a
          constant iteration count equal to zero (which this function treats
          as "rolls exactly once").  */
 343   if (!desc->simple_p
 344       || desc->assumptions
 345       || desc->infinite
 346       || !desc->const_iter
 347       || desc->niter != 0)
 348     {
 349       if (dump_file)
 350         fprintf (dump_file,
 351                  ";; Unable to prove that the loop rolls exactly once\n");
 352       return;
 353     }
 354
 355   /* Success.  */
 356   if (dump_file)
 357     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 358   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 359 }
 360
 361 /* Decide whether the LOOP is suitable for complete peeling.  On success
        loop->lpt_decision.decision is set to LPT_PEEL_COMPLETELY; otherwise
        it is left untouched.  FLAGS is unused.  The caller is expected to
        have set loop->ninsns, which is used as a divisor below.  */
 362 static void
 363 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 364 {
 365   unsigned npeel;
 366   struct niter_desc *desc;
 367
 368   if (dump_file)
 369     fprintf (dump_file, "\n;; Considering peeling completely\n");
 370
 371   /* Skip non-innermost loops.  */
 372   if (loop->inner)
 373     {
 374       if (dump_file)
 375         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 376       return;
 377     }
 378
 379   /* Do not peel cold areas.  */
 380   if (optimize_loop_for_size_p (loop))
 381     {
 382       if (dump_file)
 383         fprintf (dump_file, ";; Not considering loop, cold area\n");
 384       return;
 385     }
 386
 387   /* Can the loop be manipulated?  */
 388   if (!can_duplicate_loop_p (loop))
 389     {
 390       if (dump_file)
 391         fprintf (dump_file,
 392                  ";; Not considering loop, cannot duplicate\n");
 393       return;
 394     }
 395
 396   /* npeel = number of iterations to peel.  It is bounded both by the insn
          budget divided by the loop size and by the maximum peel count.  */
 397   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 398   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 399     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 400
 401   /* Is the loop small enough?  */
 402   if (!npeel)
 403     {
 404       if (dump_file)
 405         fprintf (dump_file, ";; Not considering loop, is too big\n");
 406       return;
 407     }
 408
 409   /* Check for simple loops.  */
 410   desc = get_simple_loop_desc (loop);
 411
 412   /* Check number of iterations.  */
 413   if (!desc->simple_p
 414       || desc->assumptions
 415       || !desc->const_iter
 416       || desc->infinite)
 417     {
 418       if (dump_file)
 419         fprintf (dump_file,
 420                  ";; Unable to prove that the loop iterates constant times\n");
 421       return;
 422     }
 423
       /* NPEEL is nonzero here, so NPEEL - 1 cannot wrap around.  */
 424   if (desc->niter > npeel - 1)
 425     {
 426       if (dump_file)
 427         {
 428           fprintf (dump_file,
 429                    ";; Not peeling loop completely, rolls too much (");
 430           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
               /* NOTE(review): NPEEL is unsigned but is printed with %d.  */
 431           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 432         }
 433       return;
 434     }
 435
 436   /* Success.  */
 437   if (dump_file)
 438     fprintf (dump_file, ";; Decided to peel loop completely\n");
 439   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 440 }
 441
 442 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 443    completely.  The transformation done:
 444
 445    for (i = 0; i < 4; i++)
 446      body;
 447
 448    ==>
 449
 450    i = 0;
 451    body; i++;
 452    body; i++;
 453    body; i++;
 454    body; i++;
 455
        The number of copies placed on the preheader edge is desc->niter.
        When that count is zero no copy is made and only the in-edge of the
        loop is removed.  */
 456 static void
 457 peel_loop_completely (struct loop *loop)
 458 {
 459   sbitmap wont_exit;
 460   unsigned HOST_WIDE_INT npeel;
 461   unsigned i;
 462   VEC (edge, heap) *remove_edges;
 463   edge ein;
 464   struct niter_desc *desc = get_simple_loop_desc (loop);
 465   struct opt_info *opt_info = NULL;
 466
 467   npeel = desc->niter;
 468
 469   if (npeel)
 470     {
 471       bool ok;
 472
           /* One bit per copy plus one more.  All bits start set; bit 0 is
              always cleared, and bit 1 as well when the description carries
              noloop assumptions.  Presumably a set bit means that the
              corresponding copy cannot leave through the exit -- confirm
              against duplicate_loop_to_header_edge.  */
 473       wont_exit = sbitmap_alloc (npeel + 1);
 474       sbitmap_ones (wont_exit);
 475       RESET_BIT (wont_exit, 0);
 476       if (desc->noloop_assumptions)
 477         RESET_BIT (wont_exit, 1);
 478
 479       remove_edges = NULL;
 480
           /* The insns are analyzed only when IV splitting is enabled;
              otherwise OPT_INFO stays NULL.  */
 481       if (flag_split_ivs_in_unroller)
 482         opt_info = analyze_insns_in_loop (loop);
 483
 484       opt_info_start_duplication (opt_info);
 485       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 486                                           npeel,
 487                                           wont_exit, desc->out_edge,
 488                                           &remove_edges,
 489                                           DLTHE_FLAG_UPDATE_FREQ
 490                                           | DLTHE_FLAG_COMPLETTE_PEEL
 491                                           | (opt_info
 492                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 493       gcc_assert (ok);
 494
 495       free (wont_exit);
 496
 497       if (opt_info)
 498         {
 499           apply_opt_in_copies (opt_info, npeel, false, true);
 500           free_opt_info (opt_info);
 501         }
 502
 503       /* Remove the exit edges collected during the duplication.  */
 504       FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
 505         remove_path (ein);
 506       VEC_free (edge, heap, remove_edges);
 507     }
 508
       /* Fetch the in-edge before the description that holds it is freed.  */
 509   ein = desc->in_edge;
 510   free_simple_loop_desc (loop);
 511
 512   /* Now remove the unreachable part of the last iteration and cancel
 513      the loop.  */
 514   remove_path (ein);
 515
 516   if (dump_file)
 517     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 518 }
 519
 520 /* Decide whether to unroll LOOP iterating constant number of times
 521    and how much.  Nothing is done unless UAP_UNROLL is set in FLAGS.  On
        success loop->lpt_decision.decision becomes LPT_UNROLL_CONSTANT and
        loop->lpt_decision.times the chosen number of additional body copies;
        otherwise the decision is left untouched.  The caller is expected to
        have set loop->ninsns and loop->av_ninsns, which are used as
        divisors below.  */
 522
 523 static void
 524 decide_unroll_constant_iterations (struct loop *loop, int flags)
 525 {
 526   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 527   struct niter_desc *desc;
 528
 529   if (!(flags & UAP_UNROLL))
 530     {
 531       /* We were not asked to, just return back silently.  */
 532       return;
 533     }
 534
 535   if (dump_file)
 536     fprintf (dump_file,
 537              "\n;; Considering unrolling loop with constant "
 538              "number of iterations\n");
 539
 540   /* nunroll = total number of copies of the original loop body in
 541      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
          It is the minimum of the two insn-budget quotients and the maximum
          unroll count.  */
 542   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 543   nunroll_by_av
 544     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 545   if (nunroll > nunroll_by_av)
 546     nunroll = nunroll_by_av;
 547   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 548     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 549
 550   /* Skip big loops.  */
 551   if (nunroll <= 1)
 552     {
 553       if (dump_file)
 554         fprintf (dump_file, ";; Not considering loop, is too big\n");
 555       return;
 556     }
 557
 558   /* Check for simple loops.  */
 559   desc = get_simple_loop_desc (loop);
 560
 561   /* Check number of iterations.  */
 562   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 563     {
 564       if (dump_file)
 565         fprintf (dump_file,
 566                  ";; Unable to prove that the loop iterates constant times\n");
 567       return;
 568     }
 569
 570   /* Check whether the loop rolls enough to consider.  */
 571   if (desc->niter < 2 * nunroll)
 572     {
 573       if (dump_file)
 574         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 575       return;
 576     }
 577
 578   /* Success; now compute number of iterations to unroll.  We alter
 579      nunroll so that as few as possible copies of loop body are
 580      necessary, while still not decreasing the number of unrollings
 581      too much (at most by 1).  */
 582   best_copies = 2 * nunroll + 10;
 583
       /* Candidates I run downwards from 2 * NUNROLL + 2, or from NITER - 2
          when the loop does not roll that often, to NUNROLL - 1.  NUNROLL is
          at least 2 here, so the lower bound is at least 1 and the unsigned
          I cannot wrap.  */
 584   i = 2 * nunroll + 2;
 585   if (i - 1 >= desc->niter)
 586     i = desc->niter - 2;
 587
 588   for (; i >= nunroll - 1; i--)
 589     {
 590       unsigned exit_mod = desc->niter % (i + 1);
 591
           /* N_COPIES is the cost of candidate I; the three cases mirror
              the peeling done by unroll_loop_constant_iterations.  */
 592       if (!loop_exit_at_end_p (loop))
 593         n_copies = exit_mod + i + 1;
 594       else if (exit_mod != (unsigned) i
 595                || desc->noloop_assumptions != NULL_RTX)
 596         n_copies = exit_mod + i + 2;
 597       else
 598         n_copies = i + 1;
 599
 600       if (n_copies < best_copies)
 601         {
 602           best_copies = n_copies;
 603           best_unroll = i;
 604         }
 605     }
 606
 607   if (dump_file)
 608     fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
 609              best_unroll + 1, best_copies, nunroll);
 610
 611   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 612   loop->lpt_decision.times = best_unroll;
 613
 614   if (dump_file)
 615     fprintf (dump_file,
 616              ";; Decided to unroll the constant times rolling loop, %d times.\n",
 617              loop->lpt_decision.times);
 618 }
 619
 620 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
 621    times.  The transformation does this:
 622
 623    for (i = 0; i < 102; i++)
 624      body;
 625
 626    ==>
 627
 628    i = 0;
 629    body; i++;
 630    body; i++;
 631    while (i < 102)
 632      {
 633        body; i++;
 634        body; i++;
 635        body; i++;
 636        body; i++;
 637      }
 638
        Iterations are first peeled onto the preheader edge (how many depends
        on where the exit test sits), then LPT_DECISION.TIMES copies of the
        body are added on the latch edge.  The iteration description of the
        loop (niter, niter_max, niter_expr, and for an exit at the end also
        in_edge/out_edge) is updated to describe the unrolled loop.  */
 639 static void
 640 unroll_loop_constant_iterations (struct loop *loop)
 641 {
 642   unsigned HOST_WIDE_INT niter;
 643   unsigned exit_mod;
 644   sbitmap wont_exit;
 645   unsigned i;
 646   VEC (edge, heap) *remove_edges;
 647   edge e;
 648   unsigned max_unroll = loop->lpt_decision.times;
 649   struct niter_desc *desc = get_simple_loop_desc (loop);
 650   bool exit_at_end = loop_exit_at_end_p (loop);
 651   struct opt_info *opt_info = NULL;
 652   bool ok;
 653
 654   niter = desc->niter;
 655
 656   /* Should not get here (such loop should be peeled instead).  */
 657   gcc_assert (niter > max_unroll + 1);
 658
       /* EXIT_MOD is the remainder of the iteration count with respect to
          the unroll factor MAX_UNROLL + 1.  */
 659   exit_mod = niter % (max_unroll + 1);
 660
 661   wont_exit = sbitmap_alloc (max_unroll + 1);
 662   sbitmap_ones (wont_exit);
 663
 664   remove_edges = NULL;
       /* The insns are analyzed only when one of the two optimizations is
          enabled; otherwise OPT_INFO stays NULL.  */
 665   if (flag_split_ivs_in_unroller
 666       || flag_variable_expansion_in_unroller)
 667     opt_info = analyze_insns_in_loop (loop);
 668
 669   if (!exit_at_end)
 670     {
 671       /* The exit is not at the end of the loop; leave exit test
 672          in the first copy, so that the loops that start with test
 673          of exit condition have continuous body after unrolling.  */
 674
 675       if (dump_file)
 676         fprintf (dump_file, ";; Condition on beginning of loop.\n");
 677
 678       /* Peel exit_mod iterations.  */
 679       RESET_BIT (wont_exit, 0);
 680       if (desc->noloop_assumptions)
 681         RESET_BIT (wont_exit, 1);
 682
 683       if (exit_mod)
 684         {
 685           opt_info_start_duplication (opt_info);
 686           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 687                                               exit_mod,
 688                                               wont_exit, desc->out_edge,
 689                                               &remove_edges,
 690                                               DLTHE_FLAG_UPDATE_FREQ
 691                                               | (opt_info && exit_mod > 1
 692                                                  ? DLTHE_RECORD_COPY_NUMBER
 693                                                    : 0));
 694           gcc_assert (ok);
 695
 696           if (opt_info && exit_mod > 1)
 697             apply_opt_in_copies (opt_info, exit_mod, false, false);
 698
               /* The peeled iterations no longer belong to the loop.  */
 699           desc->noloop_assumptions = NULL_RTX;
 700           desc->niter -= exit_mod;
 701           desc->niter_max -= exit_mod;
 702         }
 703
           /* Bit 1 is set again for the unrolling step; bit 0 stays
              cleared.  */
 704       SET_BIT (wont_exit, 1);
 705     }
 706   else
 707     {
 708       /* Leave exit test in last copy, for the same reason as above if
 709          the loop tests the condition at the end of loop body.  */
 710
 711       if (dump_file)
 712         fprintf (dump_file, ";; Condition on end of loop.\n");
 713
 714       /* We know that niter >= max_unroll + 2; so we do not need to care of
 715          case when we would exit before reaching the loop.  So just peel
 716          exit_mod + 1 iterations.  */
 717       if (exit_mod != max_unroll
 718           || desc->noloop_assumptions)
 719         {
 720           RESET_BIT (wont_exit, 0);
 721           if (desc->noloop_assumptions)
 722             RESET_BIT (wont_exit, 1);
 723
 724           opt_info_start_duplication (opt_info);
 725           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 726                                               exit_mod + 1,
 727                                               wont_exit, desc->out_edge,
 728                                               &remove_edges,
 729                                               DLTHE_FLAG_UPDATE_FREQ
 730                                               | (opt_info && exit_mod > 0
 731                                                  ? DLTHE_RECORD_COPY_NUMBER
 732                                                    : 0));
 733           gcc_assert (ok);
 734
 735           if (opt_info && exit_mod > 0)
 736             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 737
 738           desc->niter -= exit_mod + 1;
 739           desc->niter_max -= exit_mod + 1;
 740           desc->noloop_assumptions = NULL_RTX;
 741
               /* Restore the bits cleared for the peeling step.  */
 742           SET_BIT (wont_exit, 0);
 743           SET_BIT (wont_exit, 1);
 744         }
 745
           /* For the unrolling step only the bit of the last copy is
              cleared.  */
 746       RESET_BIT (wont_exit, max_unroll);
 747     }
 748
 749   /* Now unroll the loop: MAX_UNROLL copies are added on the latch edge.  */
 750
 751   opt_info_start_duplication (opt_info);
 752   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 753                                       max_unroll,
 754                                       wont_exit, desc->out_edge,
 755                                       &remove_edges,
 756                                       DLTHE_FLAG_UPDATE_FREQ
 757                                       | (opt_info
 758                                          ? DLTHE_RECORD_COPY_NUMBER
 759                                            : 0));
 760   gcc_assert (ok);
 761
 762   if (opt_info)
 763     {
 764       apply_opt_in_copies (opt_info, max_unroll, true, true);
 765       free_opt_info (opt_info);
 766     }
 767
 768   free (wont_exit);
 769
 770   if (exit_at_end)
 771     {
 772       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 773       /* Find a new in and out edge; they are in the last copy we have made.
              Of the two successor edges of EXIT_BLOCK, the one leading to the
              old exit destination becomes the out edge and the other one the
              in edge.  */
 774
 775       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 776         {
 777           desc->out_edge = EDGE_SUCC (exit_block, 0);
 778           desc->in_edge = EDGE_SUCC (exit_block, 1);
 779         }
 780       else
 781         {
 782           desc->out_edge = EDGE_SUCC (exit_block, 1);
 783           desc->in_edge = EDGE_SUCC (exit_block, 0);
 784         }
 785     }
 786
       /* Each iteration of the unrolled loop covers MAX_UNROLL + 1 original
          iterations.  */
 787   desc->niter /= max_unroll + 1;
 788   desc->niter_max /= max_unroll + 1;
 789   desc->niter_expr = GEN_INT (desc->niter);
 790
 791   /* Remove the edges collected by the duplication steps.  */
 792   FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
 793     remove_path (e);
 794   VEC_free (edge, heap, remove_edges);
 795
 796   if (dump_file)
 797     fprintf (dump_file,
 798              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 799              max_unroll, num_loop_insns (loop));
 800 }
 801
 802 /* Decide whether to unroll LOOP iterating runtime computable number of times
 803    and how much.  Nothing is done unless UAP_UNROLL is set in FLAGS.  On
        success loop->lpt_decision.decision becomes LPT_UNROLL_RUNTIME and
        loop->lpt_decision.times is set to a power of two minus one;
        otherwise the decision is left untouched.  The caller is expected to
        have set loop->ninsns and loop->av_ninsns, which are used as
        divisors below.  */
 804 static void
 805 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 806 {
 807   unsigned nunroll, nunroll_by_av, i;
 808   struct niter_desc *desc;
 809
 810   if (!(flags & UAP_UNROLL))
 811     {
 812       /* We were not asked to, just return back silently.  */
 813       return;
 814     }
 815
 816   if (dump_file)
 817     fprintf (dump_file,
 818              "\n;; Considering unrolling loop with runtime "
 819              "computable number of iterations\n");
 820
 821   /* nunroll = total number of copies of the original loop body in
 822      unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
          It is the minimum of the two insn-budget quotients and the maximum
          unroll count.  */
 823   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 824   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 825   if (nunroll > nunroll_by_av)
 826     nunroll = nunroll_by_av;
 827   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 828     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 829
       /* Let the target adjust the unroll factor if it provides the hook.  */
 830   if (targetm.loop_unroll_adjust)
 831     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 832
 833   /* Skip big loops.  */
 834   if (nunroll <= 1)
 835     {
 836       if (dump_file)
 837         fprintf (dump_file, ";; Not considering loop, is too big\n");
 838       return;
 839     }
 840
 841   /* Check for simple loops.  */
 842   desc = get_simple_loop_desc (loop);
 843
 844   /* Check simpleness.  */
 845   if (!desc->simple_p || desc->assumptions)
 846     {
 847       if (dump_file)
 848         fprintf (dump_file,
 849                  ";; Unable to prove that the number of iterations "
 850                  "can be counted in runtime\n");
 851       return;
 852     }
 853
       /* Loops with a constant iteration count are not handled here.  */
 854   if (desc->const_iter)
 855     {
 856       if (dump_file)
 857         fprintf (dump_file, ";; Loop iterates constant times\n");
 858       return;
 859     }
 860
 861   /* If we have profile feedback, check whether the loop rolls.  */
 862   if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
 863     {
 864       if (dump_file)
 865         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 866       return;
 867     }
 868
 869   /* Success; now force nunroll to be power of 2, as we are unable to
 870      cope with overflows in computation of number of iterations.  After
          the loop, I is the largest power of two not exceeding NUNROLL.  */
 871   for (i = 1; 2 * i <= nunroll; i *= 2)
 872     continue;
 873
 874   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 875   loop->lpt_decision.times = i - 1;
 876
 877   if (dump_file)
 878     fprintf (dump_file,
 879              ";; Decided to unroll the runtime computable "
 880              "times rolling loop, %d times.\n",
 881              loop->lpt_decision.times);
 882 }
 883
 884 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 885    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 886    and NULL is returned instead.  */
 887
 888 basic_block
 889 split_edge_and_insert (edge e, rtx insns)
 890 {
 891   basic_block bb;
 892
 893   if (!insns)
 894     return NULL;
 895   bb = split_edge (e);
 896   emit_insn_after (insns, BB_END (bb));
 897
 898   /* ??? We used to assume that INSNS can contain control flow insns, and
 899      that we had to try to find sub basic blocks in BB to maintain a valid
 900      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 901      and call break_superblocks when going out of cfglayout mode.  But it
 902      turns out that this never happens; and that if it does ever happen,
 903      the verify_flow_info call in loop_optimizer_finalize would fail.
 904
 905      There are two reasons why we expected we could have control flow insns
 906      in INSNS.  The first is when a comparison has to be done in parts, and
 907      the second is when the number of iterations is computed for loops with
 908      the number of iterations known at runtime.  In both cases, test cases
 909      to get control flow in INSNS appear to be impossible to construct:
 910
 911       * If do_compare_rtx_and_jump needs several branches to do comparison
 912         in a mode that needs comparison by parts, we cannot analyze the
 913         number of iterations of the loop, and we never get to unrolling it.
 914
 915       * The code in expand_divmod that was suspected to cause creation of
 916         branching code seems to be only accessed for signed division.  The
 917         divisions used by # of iterations analysis are always unsigned.
 918         Problems might arise on architectures that emits branching code
 919         for some operations that may appear in the unroller (especially
 920         for division), but we have no such architectures.
 921
 922      Considering all this, it was decided that we should for now assume
 923      that INSNS can in theory contain control flow insns, but in practice
 924      it never does.  So we don't handle the theoretical case, and should
 925      a real failure ever show up, we have a pretty good clue for how to
 926      fix it.  */
 927
 928   return bb;
 929 }
 930
/* Unroll LOOP, whose number of iterations can be computed at runtime, so
   that the unrolled body holds LOOP->LPT_DECISION.TIMES + 1 copies of the
   original body.  The remainder of the iteration count modulo the number
   of copies is handled by peeled copies that are entered through a chain
   of compare-and-jump blocks (the "switch" below).  The transformation
   does this (with some extra care for case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }

   On return the niter descriptor of LOOP (niter_expr, niter_max and, when
   the exit is at the end of the loop, in_edge, out_edge and
   noloop_assumptions) has been updated to describe the unrolled loop.  */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  VEC (basic_block, heap) *dom_bbs;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  VEC (edge, heap) *remove_edges;
  edge e;
  bool extra_zero_check, last_may_exit;
  /* Number of additional copies of the body; the unrolled loop contains
     max_unroll + 1 copies in total.  */
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  /* Collect the insns the IV splitting / variable expansion optimizations
     want to transform, if either optimization is enabled.  */
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  These are
     the blocks outside of LOOP that are dominated by some block of LOOP.  */
  dom_bbs = NULL;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      VEC (basic_block, heap) *ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
        if (!flow_bb_inside_loop_p (loop, bb))
          VEC_safe_push (basic_block, heap, dom_bbs, bb);

      VEC_free (basic_block, heap, ldom);
    }
  free (body);

  /* Choose which copy of the unrolled body keeps its exit test
     (MAY_EXIT_COPY), how many copies are peeled inside the loop below in
     addition to the first peeled copy (N_PEEL), whether a separate test
     for a zero remainder is emitted (EXTRA_ZERO_CHECK), and whether the
     last of the N_PEEL peeled copies keeps its exit test (LAST_MAY_EXIT).  */
  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
         unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  It is computed into a fresh
     pseudo; OLD_NITER keeps referring to that pseudo while NITER is
     replaced by the remainder below.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
                               niter,
                               GEN_INT (max_unroll),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop: the code computing the iteration count and its
     remainder is placed on the preheader edge.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  remove_edges = NULL;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable).  Also record the place of (possible) extra
     zero check.
     NOTE(review): bit 1 of WONT_EXIT presumably describes the single copy
     created by the call below -- confirm against
     duplicate_loop_to_header_edge.  */
  sbitmap_zero (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    SET_BIT (wont_exit, 1);
  /* Must be read before peeling, since peeling changes the preheader edge.  */
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      1, wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  Only the last peeled copy may keep its exit test,
         and only if LAST_MAY_EXIT is set.  */
      sbitmap_zero (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        SET_BIT (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          1, wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  J is the remainder value this item tests
         for; P is the probability given to the branch being taken.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      /* The branch target is a fresh block split off the current preheader
         edge.  */
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
         and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      /* The new test block is inserted in front of the previous switch
         block; it gets a fallthru edge with probability
         REG_BR_PROB_BASE - P and a new edge to PREHEADER with
         probability P.  */
      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  The test is placed on the edge
         leaving EZC_SWTCH, the block recorded before the first copy was
         peeled, and jumps to a fresh block on the current preheader edge
         when the remainder is zero.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  Every copy except MAY_EXIT_COPY is marked in
     WONT_EXIT.  */

  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  /* Apply the recorded IV splitting / variable expansion to the copies.  */
  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  Whichever successor edge of EXIT_BLOCK leads to the old
         exit destination becomes the out edge, the other the in edge.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  /* Remove the edges collected by the duplication calls above.  */
  FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this (expressed in terms of
     OLD_NITER, the pseudo holding the original iteration count):  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
                         GEN_INT (max_unroll + 1));
  desc->niter_max /= max_unroll + 1;
  if (exit_at_end)
    {
      desc->niter_expr =
        simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      desc->niter_max--;
    }

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, counting # of iterations "
             "in runtime, %i insns\n",
             max_unroll, num_loop_insns (loop));

  VEC_free (basic_block, heap, dom_bbs);
}
1193
1194 /* Decide whether to simply peel LOOP and how much.  */
1195 static void
1196 decide_peel_simple (struct loop *loop, int flags)
1197 {
1198   unsigned npeel;
1199   struct niter_desc *desc;
1200
1201   if (!(flags & UAP_PEEL))
1202     {
1203       /* We were not asked to, just return back silently.  */
1204       return;
1205     }
1206
1207   if (dump_file)
1208     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1209
1210   /* npeel = number of iterations to peel.  */
1211   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1212   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1213     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1214
1215   /* Skip big loops.  */
1216   if (!npeel)
1217     {
1218       if (dump_file)
1219         fprintf (dump_file, ";; Not considering loop, is too big\n");
1220       return;
1221     }
1222
1223   /* Check for simple loops.  */
1224   desc = get_simple_loop_desc (loop);
1225
1226   /* Check number of iterations.  */
1227   if (desc->simple_p && !desc->assumptions && desc->const_iter)
1228     {
1229       if (dump_file)
1230         fprintf (dump_file, ";; Loop iterates constant times\n");
1231       return;
1232     }
1233
1234   /* Do not simply peel loops with branches inside -- it increases number
1235      of mispredicts.  */
1236   if (num_loop_branches (loop) > 1)
1237     {
1238       if (dump_file)
1239         fprintf (dump_file, ";; Not peeling, contains branches\n");
1240       return;
1241     }
1242
1243   if (loop->header->count)
1244     {
1245       unsigned niter = expected_loop_iterations (loop);
1246       if (niter + 1 > npeel)
1247         {
1248           if (dump_file)
1249             {
1250               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1251               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1252                        (HOST_WIDEST_INT) (niter + 1));
1253               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1254                        npeel);
1255             }
1256           return;
1257         }
1258       npeel = niter + 1;
1259     }
1260   else
1261     {
1262       /* For now we have no good heuristics to decide whether loop peeling
1263          will be effective, so disable it.  */
1264       if (dump_file)
1265         fprintf (dump_file,
1266                  ";; Not peeling loop, no evidence it will be profitable\n");
1267       return;
1268     }
1269
1270   /* Success.  */
1271   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1272   loop->lpt_decision.times = npeel;
1273
1274   if (dump_file)
1275     fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1276              loop->lpt_decision.times);
1277 }
1278
1279 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1280    while (cond)
1281      body;
1282
1283    ==>
1284
1285    if (!cond) goto end;
1286    body;
1287    if (!cond) goto end;
1288    body;
1289    while (cond)
1290      body;
1291    end: ;
1292    */
1293 static void
1294 peel_loop_simple (struct loop *loop)
1295 {
1296   sbitmap wont_exit;
1297   unsigned npeel = loop->lpt_decision.times;
1298   struct niter_desc *desc = get_simple_loop_desc (loop);
1299   struct opt_info *opt_info = NULL;
1300   bool ok;
1301
1302   if (flag_split_ivs_in_unroller && npeel > 1)
1303     opt_info = analyze_insns_in_loop (loop);
1304
1305   wont_exit = sbitmap_alloc (npeel + 1);
1306   sbitmap_zero (wont_exit);
1307
1308   opt_info_start_duplication (opt_info);
1309
1310   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1311                                       npeel, wont_exit, NULL,
1312                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1313                                       | (opt_info
1314                                          ? DLTHE_RECORD_COPY_NUMBER
1315                                            : 0));
1316   gcc_assert (ok);
1317
1318   free (wont_exit);
1319
1320   if (opt_info)
1321     {
1322       apply_opt_in_copies (opt_info, npeel, false, false);
1323       free_opt_info (opt_info);
1324     }
1325
1326   if (desc->simple_p)
1327     {
1328       if (desc->const_iter)
1329         {
1330           desc->niter -= npeel;
1331           desc->niter_expr = GEN_INT (desc->niter);
1332           desc->noloop_assumptions = NULL_RTX;
1333         }
1334       else
1335         {
1336           /* We cannot just update niter_expr, as its value might be clobbered
1337              inside loop.  We could handle this by counting the number into
1338              temporary just like we do in runtime unrolling, but it does not
1339              seem worthwhile.  */
1340           free_simple_loop_desc (loop);
1341         }
1342     }
1343   if (dump_file)
1344     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1345 }
1346
1347 /* Decide whether to unroll LOOP stupidly and how much.  */
1348 static void
1349 decide_unroll_stupid (struct loop *loop, int flags)
1350 {
1351   unsigned nunroll, nunroll_by_av, i;
1352   struct niter_desc *desc;
1353
1354   if (!(flags & UAP_UNROLL_ALL))
1355     {
1356       /* We were not asked to, just return back silently.  */
1357       return;
1358     }
1359
1360   if (dump_file)
1361     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1362
1363   /* nunroll = total number of copies of the original loop body in
1364      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1365   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1366   nunroll_by_av
1367     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1368   if (nunroll > nunroll_by_av)
1369     nunroll = nunroll_by_av;
1370   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1371     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1372
1373   if (targetm.loop_unroll_adjust)
1374     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1375
1376   /* Skip big loops.  */
1377   if (nunroll <= 1)
1378     {
1379       if (dump_file)
1380         fprintf (dump_file, ";; Not considering loop, is too big\n");
1381       return;
1382     }
1383
1384   /* Check for simple loops.  */
1385   desc = get_simple_loop_desc (loop);
1386
1387   /* Check simpleness.  */
1388   if (desc->simple_p && !desc->assumptions)
1389     {
1390       if (dump_file)
1391         fprintf (dump_file, ";; The loop is simple\n");
1392       return;
1393     }
1394
1395   /* Do not unroll loops with branches inside -- it increases number
1396      of mispredicts.  */
1397   if (num_loop_branches (loop) > 1)
1398     {
1399       if (dump_file)
1400         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1401       return;
1402     }
1403
1404   /* If we have profile feedback, check whether the loop rolls.  */
1405   if (loop->header->count
1406       && expected_loop_iterations (loop) < 2 * nunroll)
1407     {
1408       if (dump_file)
1409         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1410       return;
1411     }
1412
1413   /* Success.  Now force nunroll to be power of 2, as it seems that this
1414      improves results (partially because of better alignments, partially
1415      because of some dark magic).  */
1416   for (i = 1; 2 * i <= nunroll; i *= 2)
1417     continue;
1418
1419   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1420   loop->lpt_decision.times = i - 1;
1421
1422   if (dump_file)
1423     fprintf (dump_file,
1424              ";; Decided to unroll the loop stupidly, %d times.\n",
1425              loop->lpt_decision.times);
1426 }
1427
1428 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1429    while (cond)
1430      body;
1431
1432    ==>
1433
1434    while (cond)
1435      {
1436        body;
1437        if (!cond) break;
1438        body;
1439        if (!cond) break;
1440        body;
1441        if (!cond) break;
1442        body;
1443      }
1444    */
1445 static void
1446 unroll_loop_stupid (struct loop *loop)
1447 {
1448   sbitmap wont_exit;
1449   unsigned nunroll = loop->lpt_decision.times;
1450   struct niter_desc *desc = get_simple_loop_desc (loop);
1451   struct opt_info *opt_info = NULL;
1452   bool ok;
1453
1454   if (flag_split_ivs_in_unroller
1455       || flag_variable_expansion_in_unroller)
1456     opt_info = analyze_insns_in_loop (loop);
1457
1458
1459   wont_exit = sbitmap_alloc (nunroll + 1);
1460   sbitmap_zero (wont_exit);
1461   opt_info_start_duplication (opt_info);
1462
1463   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1464                                       nunroll, wont_exit,
1465                                       NULL, NULL,
1466                                       DLTHE_FLAG_UPDATE_FREQ
1467                                       | (opt_info
1468                                          ? DLTHE_RECORD_COPY_NUMBER
1469                                            : 0));
1470   gcc_assert (ok);
1471
1472   if (opt_info)
1473     {
1474       apply_opt_in_copies (opt_info, nunroll, true, true);
1475       free_opt_info (opt_info);
1476     }
1477
1478   free (wont_exit);
1479
1480   if (desc->simple_p)
1481     {
1482       /* We indeed may get here provided that there are nontrivial assumptions
1483          for a loop to be really simple.  We could update the counts, but the
1484          problem is that we are unable to decide which exit will be taken
1485          (not really true in case the number of iterations is constant,
1486          but noone will do anything with this information, so we do not
1487          worry about it).  */
1488       desc->simple_p = false;
1489     }
1490
1491   if (dump_file)
1492     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1493              nunroll, num_loop_insns (loop));
1494 }
1495
1496 /* A hash function for information about insns to split.  */
1497
1498 static hashval_t
1499 si_info_hash (const void *ivts)
1500 {
1501   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1502 }
1503
1504 /* An equality functions for information about insns to split.  */
1505
1506 static int
1507 si_info_eq (const void *ivts1, const void *ivts2)
1508 {
1509   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1510   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1511
1512   return i1->insn == i2->insn;
1513 }
1514
1515 /* Return a hash for VES, which is really a "var_to_expand *".  */
1516
1517 static hashval_t
1518 ve_info_hash (const void *ves)
1519 {
1520   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1521 }
1522
1523 /* Return true if IVTS1 and IVTS2 (which are really both of type
1524    "var_to_expand *") refer to the same instruction.  */
1525
1526 static int
1527 ve_info_eq (const void *ivts1, const void *ivts2)
1528 {
1529   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1530   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1531
1532   return i1->insn == i2->insn;
1533 }
1534
1535 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1536    Set *DEBUG_USES to the number of debug insns that reference the
1537    variable.  */
1538
1539 bool
1540 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1541                                   int *debug_uses)
1542 {
1543   basic_block *body, bb;
1544   unsigned i;
1545   int count_ref = 0;
1546   rtx insn;
1547
1548   body = get_loop_body (loop);
1549   for (i = 0; i < loop->num_nodes; i++)
1550     {
1551       bb = body[i];
1552
1553       FOR_BB_INSNS (bb, insn)
1554         if (!rtx_referenced_p (reg, insn))
1555           continue;
1556         else if (DEBUG_INSN_P (insn))
1557           ++*debug_uses;
1558         else if (++count_ref > 1)
1559           break;
1560     }
1561   free (body);
1562   return (count_ref  == 1);
1563 }
1564
1565 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1566
1567 static void
1568 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1569 {
1570   basic_block *body, bb;
1571   unsigned i;
1572   rtx insn;
1573
1574   body = get_loop_body (loop);
1575   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1576     {
1577       bb = body[i];
1578
1579       FOR_BB_INSNS (bb, insn)
1580         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1581           continue;
1582         else
1583           {
1584             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1585                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1586             if (!--debug_uses)
1587               break;
1588           }
1589     }
1590   free (body);
1591 }
1592
1593 /* Determine whether INSN contains an accumulator
1594    which can be expanded into separate copies,
1595    one for each copy of the LOOP body.
1596
1597    for (i = 0 ; i < n; i++)
1598      sum += a[i];
1599
1600    ==>
1601
1602    sum += a[i]
1603    ....
1604    i = i+1;
1605    sum1 += a[i]
1606    ....
1607    i = i+1
1608    sum2 += a[i];
1609    ....
1610
1611    Return NULL if INSN contains no opportunity for expansion of accumulator.
1612    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1613    information and return a pointer to it.
1614 */
1615
1616 static struct var_to_expand *
1617 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1618 {
1619   rtx set, dest, src;
1620   struct var_to_expand *ves;
1621   unsigned accum_pos;
1622   enum rtx_code code;
1623   int debug_uses = 0;
1624
1625   set = single_set (insn);
1626   if (!set)
1627     return NULL;
1628
1629   dest = SET_DEST (set);
1630   src = SET_SRC (set);
1631   code = GET_CODE (src);
1632
1633   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1634     return NULL;
1635
1636   if (FLOAT_MODE_P (GET_MODE (dest)))
1637     {
1638       if (!flag_associative_math)
1639         return NULL;
1640       /* In the case of FMA, we're also changing the rounding.  */
1641       if (code == FMA && !flag_unsafe_math_optimizations)
1642         return NULL;
1643     }
1644
1645   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1646      in MD.  But if there is no optab to generate the insn, we can not
1647      perform the variable expansion.  This can happen if an MD provides
1648      an insn but not a named pattern to generate it, for example to avoid
1649      producing code that needs additional mode switches like for x87/mmx.
1650
1651      So we check have_insn_for which looks for an optab for the operation
1652      in SRC.  If it doesn't exist, we can't perform the expansion even
1653      though INSN is valid.  */
1654   if (!have_insn_for (code, GET_MODE (src)))
1655     return NULL;
1656
1657   if (!REG_P (dest)
1658       && !(GET_CODE (dest) == SUBREG
1659            && REG_P (SUBREG_REG (dest))))
1660     return NULL;
1661
1662   /* Find the accumulator use within the operation.  */
1663   if (code == FMA)
1664     {
1665       /* We only support accumulation via FMA in the ADD position.  */
1666       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1667         return NULL;
1668       accum_pos = 2;
1669     }
1670   else if (rtx_equal_p (dest, XEXP (src, 0)))
1671     accum_pos = 0;
1672   else if (rtx_equal_p (dest, XEXP (src, 1)))
1673     {
1674       /* The method of expansion that we are using; which includes the
1675          initialization of the expansions with zero and the summation of
1676          the expansions at the end of the computation will yield wrong
1677          results for (x = something - x) thus avoid using it in that case.  */
1678       if (code == MINUS)
1679         return NULL;
1680       accum_pos = 1;
1681     }
1682   else
1683     return NULL;
1684
1685   /* It must not otherwise be used.  */
1686   if (code == FMA)
1687     {
1688       if (rtx_referenced_p (dest, XEXP (src, 0))
1689           || rtx_referenced_p (dest, XEXP (src, 1)))
1690         return NULL;
1691     }
1692   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1693     return NULL;
1694
1695   /* It must be used in exactly one insn.  */
1696   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1697     return NULL;
1698
1699   if (dump_file)
1700     {
1701       fprintf (dump_file, "\n;; Expanding Accumulator ");
1702       print_rtl (dump_file, dest);
1703       fprintf (dump_file, "\n");
1704     }
1705
1706   if (debug_uses)
1707     /* Instead of resetting the debug insns, we could replace each
1708        debug use in the loop with the sum or product of all expanded
1709        accummulators.  Since we'll only know of all expansions at the
1710        end, we'd have to keep track of which vars_to_expand a debug
1711        insn in the loop references, take note of each copy of the
1712        debug insn during unrolling, and when it's all done, compute
1713        the sum or product of each variable and adjust the original
1714        debug insn and each copy thereof.  What a pain!  */
1715     reset_debug_uses_in_loop (loop, dest, debug_uses);
1716
1717   /* Record the accumulator to expand.  */
1718   ves = XNEW (struct var_to_expand);
1719   ves->insn = insn;
1720   ves->reg = copy_rtx (dest);
1721   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1722   ves->next = NULL;
1723   ves->op = GET_CODE (src);
1724   ves->expansion_count = 0;
1725   ves->reuse_expansion = 0;
1726   ves->accum_pos = accum_pos;
1727   return ves;
1728 }
1729
1730 /* Determine whether there is an induction variable in INSN that
1731    we would like to split during unrolling.
1732
1733    I.e. replace
1734
1735    i = i + 1;
1736    ...
1737    i = i + 1;
1738    ...
1739    i = i + 1;
1740    ...
1741
1742    type chains by
1743
1744    i0 = i + 1
1745    ...
1746    i = i0 + 1
1747    ...
1748    i = i0 + 2
1749    ...
1750
1751    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1752    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1753    pointer to it.  */
1754
1755 static struct iv_to_split *
1756 analyze_iv_to_split_insn (rtx insn)
1757 {
1758   rtx set, dest;
1759   struct rtx_iv iv;
1760   struct iv_to_split *ivts;
1761   bool ok;
1762
1763   /* For now we just split the basic induction variables.  Later this may be
1764      extended for example by selecting also addresses of memory references.  */
1765   set = single_set (insn);
1766   if (!set)
1767     return NULL;
1768
1769   dest = SET_DEST (set);
1770   if (!REG_P (dest))
1771     return NULL;
1772
1773   if (!biv_p (insn, dest))
1774     return NULL;
1775
1776   ok = iv_analyze_result (insn, dest, &iv);
1777
1778   /* This used to be an assert under the assumption that if biv_p returns
1779      true that iv_analyze_result must also return true.  However, that
1780      assumption is not strictly correct as evidenced by pr25569.
1781
1782      Returning NULL when iv_analyze_result returns false is safe and
1783      avoids the problems in pr25569 until the iv_analyze_* routines
1784      can be fixed, which is apparently hard and time consuming
1785      according to their author.  */
1786   if (! ok)
1787     return NULL;
1788
1789   if (iv.step == const0_rtx
1790       || iv.mode != iv.extend_mode)
1791     return NULL;
1792
1793   /* Record the insn to split.  */
1794   ivts = XNEW (struct iv_to_split);
1795   ivts->insn = insn;
1796   ivts->base_var = NULL_RTX;
1797   ivts->step = iv.step;
1798   ivts->next = NULL;
1799   ivts->n_loc = 1;
1800   ivts->loc[0] = 1;
1801
1802   return ivts;
1803 }
1804
1805 /* Determines which of insns in LOOP can be optimized.
1806    Return a OPT_INFO struct with the relevant hash tables filled
1807    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1808    is undefined for the return value.  */
1809
1810 static struct opt_info *
1811 analyze_insns_in_loop (struct loop *loop)
1812 {
1813   basic_block *body, bb;
1814   unsigned i;
1815   struct opt_info *opt_info = XCNEW (struct opt_info);
1816   rtx insn;
1817   struct iv_to_split *ivts = NULL;
1818   struct var_to_expand *ves = NULL;
1819   PTR *slot1;
1820   PTR *slot2;
1821   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1822   edge exit;
1823   bool can_apply = false;
1824
1825   iv_analysis_loop_init (loop);
1826
1827   body = get_loop_body (loop);
1828
1829   if (flag_split_ivs_in_unroller)
1830     {
1831       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1832                                               si_info_hash, si_info_eq, free);
1833       opt_info->iv_to_split_head = NULL;
1834       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1835     }
1836
1837   /* Record the loop exit bb and loop preheader before the unrolling.  */
1838   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1839
1840   if (VEC_length (edge, edges) == 1)
1841     {
1842       exit = VEC_index (edge, edges, 0);
1843       if (!(exit->flags & EDGE_COMPLEX))
1844         {
1845           opt_info->loop_exit = split_edge (exit);
1846           can_apply = true;
1847         }
1848     }
1849
1850   if (flag_variable_expansion_in_unroller
1851       && can_apply)
1852     {
1853       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1854                                                         ve_info_hash,
1855                                                         ve_info_eq, free);
1856       opt_info->var_to_expand_head = NULL;
1857       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1858     }
1859
1860   for (i = 0; i < loop->num_nodes; i++)
1861     {
1862       bb = body[i];
1863       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1864         continue;
1865
1866       FOR_BB_INSNS (bb, insn)
1867       {
1868         if (!INSN_P (insn))
1869           continue;
1870
1871         if (opt_info->insns_to_split)
1872           ivts = analyze_iv_to_split_insn (insn);
1873
1874         if (ivts)
1875           {
1876             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1877             gcc_assert (*slot1 == NULL);
1878             *slot1 = ivts;
1879             *opt_info->iv_to_split_tail = ivts;
1880             opt_info->iv_to_split_tail = &ivts->next;
1881             continue;
1882           }
1883
1884         if (opt_info->insns_with_var_to_expand)
1885           ves = analyze_insn_to_expand_var (loop, insn);
1886
1887         if (ves)
1888           {
1889             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1890             gcc_assert (*slot2 == NULL);
1891             *slot2 = ves;
1892             *opt_info->var_to_expand_tail = ves;
1893             opt_info->var_to_expand_tail = &ves->next;
1894           }
1895       }
1896     }
1897
1898   VEC_free (edge, heap, edges);
1899   free (body);
1900   return opt_info;
1901 }
1902
1903 /* Called just before loop duplication.  Records start of duplicated area
1904    to OPT_INFO.  */
1905
1906 static void
1907 opt_info_start_duplication (struct opt_info *opt_info)
1908 {
1909   if (opt_info)
1910     opt_info->first_new_block = last_basic_block;
1911 }
1912
/* Return the number of iterations between the initialization of the base
   variable and the copy number N_COPY.  N_COPIES is the total number of
   newly created copies.  UNROLLING is true if we are unrolling (not
   peeling) the loop.

   When unrolling, the initialization is done in the original loop body
   (copy number 0), so the distance is N_COPY itself.  When peeling, the
   initialization occurs in copy number 1 and the original loop body
   (number 0) comes last, i.e. N_COPIES iterations after it.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    return n_copy;

  return n_copy != 0 ? n_copy - 1 : n_copies;
}
1937
1938 /* Locate in EXPR the expression corresponding to the location recorded
1939    in IVTS, and return a pointer to the RTX for this location.  */
1940
1941 static rtx *
1942 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1943 {
1944   unsigned i;
1945   rtx *ret = &expr;
1946
1947   for (i = 0; i < ivts->n_loc; i++)
1948     ret = &XEXP (*ret, ivts->loc[i]);
1949
1950   return ret;
1951 }
1952
1953 /* Allocate basic variable for the induction variable chain.  */
1954
1955 static void
1956 allocate_basic_variable (struct iv_to_split *ivts)
1957 {
1958   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1959
1960   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1961 }
1962
1963 /* Insert initialization of basic variable of IVTS before INSN, taking
1964    the initial value from INSN.  */
1965
1966 static void
1967 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
1968 {
1969   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
1970   rtx seq;
1971
1972   start_sequence ();
1973   expr = force_operand (expr, ivts->base_var);
1974   if (expr != ivts->base_var)
1975     emit_move_insn (ivts->base_var, expr);
1976   seq = get_insns ();
1977   end_sequence ();
1978
1979   emit_insn_before (seq, insn);
1980 }
1981
1982 /* Replace the use of induction variable described in IVTS in INSN
1983    by base variable + DELTA * step.  */
1984
1985 static void
1986 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
1987 {
1988   rtx expr, *loc, seq, incr, var;
1989   enum machine_mode mode = GET_MODE (ivts->base_var);
1990   rtx src, dest, set;
1991
1992   /* Construct base + DELTA * step.  */
1993   if (!delta)
1994     expr = ivts->base_var;
1995   else
1996     {
1997       incr = simplify_gen_binary (MULT, mode,
1998                                   ivts->step, gen_int_mode (delta, mode));
1999       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2000                                   ivts->base_var, incr);
2001     }
2002
2003   /* Figure out where to do the replacement.  */
2004   loc = get_ivts_expr (single_set (insn), ivts);
2005
2006   /* If we can make the replacement right away, we're done.  */
2007   if (validate_change (insn, loc, expr, 0))
2008     return;
2009
2010   /* Otherwise, force EXPR into a register and try again.  */
2011   start_sequence ();
2012   var = gen_reg_rtx (mode);
2013   expr = force_operand (expr, var);
2014   if (expr != var)
2015     emit_move_insn (var, expr);
2016   seq = get_insns ();
2017   end_sequence ();
2018   emit_insn_before (seq, insn);
2019
2020   if (validate_change (insn, loc, var, 0))
2021     return;
2022
2023   /* The last chance.  Try recreating the assignment in insn
2024      completely from scratch.  */
2025   set = single_set (insn);
2026   gcc_assert (set);
2027
2028   start_sequence ();
2029   *loc = var;
2030   src = copy_rtx (SET_SRC (set));
2031   dest = copy_rtx (SET_DEST (set));
2032   src = force_operand (src, dest);
2033   if (src != dest)
2034     emit_move_insn (dest, src);
2035   seq = get_insns ();
2036   end_sequence ();
2037
2038   emit_insn_before (seq, insn);
2039   delete_insn (insn);
2040 }
2041
2042
2043 /* Return one expansion of the accumulator recorded in struct VE.  */
2044
2045 static rtx
2046 get_expansion (struct var_to_expand *ve)
2047 {
2048   rtx reg;
2049
2050   if (ve->reuse_expansion == 0)
2051     reg = ve->reg;
2052   else
2053     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2054
2055   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2056     ve->reuse_expansion = 0;
2057   else
2058     ve->reuse_expansion++;
2059
2060   return reg;
2061 }
2062
2063
2064 /* Given INSN replace the uses of the accumulator recorded in VE
2065    with a new register.  */
2066
2067 static void
2068 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2069 {
2070   rtx new_reg, set;
2071   bool really_new_expansion = false;
2072
2073   set = single_set (insn);
2074   gcc_assert (set);
2075
2076   /* Generate a new register only if the expansion limit has not been
2077      reached.  Else reuse an already existing expansion.  */
2078   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2079     {
2080       really_new_expansion = true;
2081       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2082     }
2083   else
2084     new_reg = get_expansion (ve);
2085
2086   validate_change (insn, &SET_DEST (set), new_reg, 1);
2087   validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2088
2089   if (apply_change_group ())
2090     if (really_new_expansion)
2091       {
2092         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2093         ve->expansion_count++;
2094       }
2095 }
2096
2097 /* Initialize the variable expansions in loop preheader.  PLACE is the
2098    loop-preheader basic block where the initialization of the
2099    expansions should take place.  The expansions are initialized with
2100    (-0) when the operation is plus or minus to honor sign zero.  This
2101    way we can prevent cases where the sign of the final result is
2102    effected by the sign of the expansion.  Here is an example to
2103    demonstrate this:
2104
2105    for (i = 0 ; i < n; i++)
2106      sum += something;
2107
2108    ==>
2109
2110    sum += something
2111    ....
2112    i = i+1;
2113    sum1 += something
2114    ....
2115    i = i+1
2116    sum2 += something;
2117    ....
2118
2119    When SUM is initialized with -zero and SOMETHING is also -zero; the
2120    final result of sum should be -zero thus the expansions sum1 and sum2
2121    should be initialized with -zero as well (otherwise we will get +zero
2122    as the final result).  */
2123
2124 static void
2125 insert_var_expansion_initialization (struct var_to_expand *ve,
2126                                      basic_block place)
2127 {
2128   rtx seq, var, zero_init, insn;
2129   unsigned i;
2130   enum machine_mode mode = GET_MODE (ve->reg);
2131   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2132
2133   if (VEC_length (rtx, ve->var_expansions) == 0)
2134     return;
2135
2136   start_sequence ();
2137   switch (ve->op)
2138     {
2139     case FMA:
2140       /* Note that we only accumulate FMA via the ADD operand.  */
2141     case PLUS:
2142     case MINUS:
2143       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2144         {
2145           if (honor_signed_zero_p)
2146             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2147           else
2148             zero_init = CONST0_RTX (mode);
2149           emit_move_insn (var, zero_init);
2150         }
2151       break;
2152
2153     case MULT:
2154       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2155         {
2156           zero_init = CONST1_RTX (GET_MODE (var));
2157           emit_move_insn (var, zero_init);
2158         }
2159       break;
2160
2161     default:
2162       gcc_unreachable ();
2163     }
2164
2165   seq = get_insns ();
2166   end_sequence ();
2167
2168   insn = BB_HEAD (place);
2169   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2170     insn = NEXT_INSN (insn);
2171
2172   emit_insn_after (seq, insn);
2173 }
2174
2175 /* Combine the variable expansions at the loop exit.  PLACE is the
2176    loop exit basic block where the summation of the expansions should
2177    take place.  */
2178
2179 static void
2180 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2181 {
2182   rtx sum = ve->reg;
2183   rtx expr, seq, var, insn;
2184   unsigned i;
2185
2186   if (VEC_length (rtx, ve->var_expansions) == 0)
2187     return;
2188
2189   start_sequence ();
2190   switch (ve->op)
2191     {
2192     case FMA:
2193       /* Note that we only accumulate FMA via the ADD operand.  */
2194     case PLUS:
2195     case MINUS:
2196       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2197         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2198       break;
2199
2200     case MULT:
2201       FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
2202         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2203       break;
2204
2205     default:
2206       gcc_unreachable ();
2207     }
2208
2209   expr = force_operand (sum, ve->reg);
2210   if (expr != ve->reg)
2211     emit_move_insn (ve->reg, expr);
2212   seq = get_insns ();
2213   end_sequence ();
2214
2215   insn = BB_HEAD (place);
2216   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2217     insn = NEXT_INSN (insn);
2218
2219   emit_insn_after (seq, insn);
2220 }
2221
2222 /* Apply loop optimizations in loop copies using the
2223    data which gathered during the unrolling.  Structure
2224    OPT_INFO record that data.
2225
2226    UNROLLING is true if we unrolled (not peeled) the loop.
2227    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2228    the loop (as it should happen in complete unrolling, but not in ordinary
2229    peeling of the loop).  */
2230
2231 static void
2232 apply_opt_in_copies (struct opt_info *opt_info,
2233                      unsigned n_copies, bool unrolling,
2234                      bool rewrite_original_loop)
2235 {
2236   unsigned i, delta;
2237   basic_block bb, orig_bb;
2238   rtx insn, orig_insn, next;
2239   struct iv_to_split ivts_templ, *ivts;
2240   struct var_to_expand ve_templ, *ves;
2241
2242   /* Sanity check -- we need to put initialization in the original loop
2243      body.  */
2244   gcc_assert (!unrolling || rewrite_original_loop);
2245
2246   /* Allocate the basic variables (i0).  */
2247   if (opt_info->insns_to_split)
2248     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2249       allocate_basic_variable (ivts);
2250
2251   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2252     {
2253       bb = BASIC_BLOCK (i);
2254       orig_bb = get_bb_original (bb);
2255
2256       /* bb->aux holds position in copy sequence initialized by
2257          duplicate_loop_to_header_edge.  */
2258       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2259                                         unrolling);
2260       bb->aux = 0;
2261       orig_insn = BB_HEAD (orig_bb);
2262       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2263         {
2264           next = NEXT_INSN (insn);
2265           if (!INSN_P (insn))
2266             continue;
2267
2268           while (!INSN_P (orig_insn))
2269             orig_insn = NEXT_INSN (orig_insn);
2270
2271           ivts_templ.insn = orig_insn;
2272           ve_templ.insn = orig_insn;
2273
2274           /* Apply splitting iv optimization.  */
2275           if (opt_info->insns_to_split)
2276             {
2277               ivts = (struct iv_to_split *)
2278                 htab_find (opt_info->insns_to_split, &ivts_templ);
2279
2280               if (ivts)
2281                 {
2282                   gcc_assert (GET_CODE (PATTERN (insn))
2283                               == GET_CODE (PATTERN (orig_insn)));
2284
2285                   if (!delta)
2286                     insert_base_initialization (ivts, insn);
2287                   split_iv (ivts, insn, delta);
2288                 }
2289             }
2290           /* Apply variable expansion optimization.  */
2291           if (unrolling && opt_info->insns_with_var_to_expand)
2292             {
2293               ves = (struct var_to_expand *)
2294                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2295               if (ves)
2296                 {
2297                   gcc_assert (GET_CODE (PATTERN (insn))
2298                               == GET_CODE (PATTERN (orig_insn)));
2299                   expand_var_during_unrolling (ves, insn);
2300                 }
2301             }
2302           orig_insn = NEXT_INSN (orig_insn);
2303         }
2304     }
2305
2306   if (!rewrite_original_loop)
2307     return;
2308
2309   /* Initialize the variable expansions in the loop preheader
2310      and take care of combining them at the loop exit.  */
2311   if (opt_info->insns_with_var_to_expand)
2312     {
2313       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2314         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2315       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2316         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2317     }
2318
2319   /* Rewrite also the original loop body.  Find them as originals of the blocks
2320      in the last copied iteration, i.e. those that have
2321      get_bb_copy (get_bb_original (bb)) == bb.  */
2322   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2323     {
2324       bb = BASIC_BLOCK (i);
2325       orig_bb = get_bb_original (bb);
2326       if (get_bb_copy (orig_bb) != bb)
2327         continue;
2328
2329       delta = determine_split_iv_delta (0, n_copies, unrolling);
2330       for (orig_insn = BB_HEAD (orig_bb);
2331            orig_insn != NEXT_INSN (BB_END (bb));
2332            orig_insn = next)
2333         {
2334           next = NEXT_INSN (orig_insn);
2335
2336           if (!INSN_P (orig_insn))
2337             continue;
2338
2339           ivts_templ.insn = orig_insn;
2340           if (opt_info->insns_to_split)
2341             {
2342               ivts = (struct iv_to_split *)
2343                 htab_find (opt_info->insns_to_split, &ivts_templ);
2344               if (ivts)
2345                 {
2346                   if (!delta)
2347                     insert_base_initialization (ivts, orig_insn);
2348                   split_iv (ivts, orig_insn, delta);
2349                   continue;
2350                 }
2351             }
2352
2353         }
2354     }
2355 }
2356
2357 /* Release OPT_INFO.  */
2358
2359 static void
2360 free_opt_info (struct opt_info *opt_info)
2361 {
2362   if (opt_info->insns_to_split)
2363     htab_delete (opt_info->insns_to_split);
2364   if (opt_info->insns_with_var_to_expand)
2365     {
2366       struct var_to_expand *ves;
2367
2368       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2369         VEC_free (rtx, heap, ves->var_expansions);
2370       htab_delete (opt_info->insns_with_var_to_expand);
2371     }
2372   free (opt_info);
2373 }