invoke.texi: Document -ftree-loop-distribution.
[gcc.git] / gcc / tree-vectorizer.c
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 /* Loop Vectorization Pass.
22
23 This pass tries to vectorize loops. This first implementation focuses on
24 simple inner-most loops, with no conditional control flow, and a set of
25 simple operations which vector form can be expressed using existing
26 tree codes (PLUS, MULT etc).
27
28 For example, the vectorizer transforms the following simple loop:
29
30 short a[N]; short b[N]; short c[N]; int i;
31
32 for (i=0; i<N; i++){
33 a[i] = b[i] + c[i];
34 }
35
36 as if it was manually vectorized by rewriting the source code into:
37
38 typedef int __attribute__((mode(V8HI))) v8hi;
39 short a[N]; short b[N]; short c[N]; int i;
40 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
41 v8hi va, vb, vc;
42
43 for (i=0; i<N/8; i++){
44 vb = pb[i];
45 vc = pc[i];
46 va = vb + vc;
47 pa[i] = va;
48 }
49
50 The main entry to this pass is vectorize_loops(), in which
51 the vectorizer applies a set of analyses on a given set of loops,
52 followed by the actual vectorization transformation for the loops that
53 had successfully passed the analysis phase.
54
55 Throughout this pass we make a distinction between two types of
56 data: scalars (which are represented by SSA_NAMES), and memory references
57 ("data-refs"). These two types of data require different handling both
58 during analysis and transformation. The types of data-refs that the
59 vectorizer currently supports are ARRAY_REFS which base is an array DECL
60 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
61 accesses are required to have a simple (consecutive) access pattern.
62
63 Analysis phase:
64 ===============
65 The driver for the analysis phase is vect_analyze_loop_nest().
66 It applies a set of analyses, some of which rely on the scalar evolution
67 analyzer (scev) developed by Sebastian Pop.
68
69 During the analysis phase the vectorizer records some information
70 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
71 loop, as well as general information about the loop as a whole, which is
72 recorded in a "loop_vec_info" struct attached to each loop.
73
74 Transformation phase:
75 =====================
76 The loop transformation phase scans all the stmts in the loop, and
77 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
78     the loop that needs to be vectorized. It inserts the vector code sequence
79 just before the scalar stmt S, and records a pointer to the vector code
80 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
81 attached to S). This pointer will be used for the vectorization of following
82 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
83 otherwise, we rely on dead code elimination for removing it.
84
85 For example, say stmt S1 was vectorized into stmt VS1:
86
87 VS1: vb = px[i];
88 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
89 S2: a = b;
90
91 To vectorize stmt S2, the vectorizer first finds the stmt that defines
92 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
93 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
94 resulting sequence would be:
95
96 VS1: vb = px[i];
97 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
98 VS2: va = vb;
99 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
100
101 Operands that are not SSA_NAMEs, are data-refs that appear in
102 load/store operations (like 'x[i]' in S1), and are handled differently.
103
104 Target modeling:
105 =================
106 Currently the only target specific information that is used is the
107 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
108 support different sizes of vectors, for now will need to specify one value
109 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
110
111 Since we only vectorize operations which vector form can be
112 expressed using existing tree codes, to verify that an operation is
113 supported, the vectorizer checks the relevant optab at the relevant
114 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
115 the value found is CODE_FOR_nothing, then there's no target support, and
116 we can't vectorize the stmt.
117
118 For additional information on this project see:
119 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
120 */
121
122 #include "config.h"
123 #include "system.h"
124 #include "coretypes.h"
125 #include "tm.h"
126 #include "ggc.h"
127 #include "tree.h"
128 #include "target.h"
129 #include "rtl.h"
130 #include "basic-block.h"
131 #include "diagnostic.h"
132 #include "tree-flow.h"
133 #include "tree-dump.h"
134 #include "timevar.h"
135 #include "cfgloop.h"
136 #include "cfglayout.h"
137 #include "expr.h"
138 #include "recog.h"
139 #include "optabs.h"
140 #include "params.h"
141 #include "toplev.h"
142 #include "tree-chrec.h"
143 #include "tree-data-ref.h"
144 #include "tree-scalar-evolution.h"
145 #include "input.h"
146 #include "tree-vectorizer.h"
147 #include "tree-pass.h"
148
149 /*************************************************************************
150 General Vectorization Utilities
151 *************************************************************************/
152
/* Dump stream for vectorizer diagnostics: set to dump_file when one
   exists, otherwise to stderr.  */
FILE *vect_dump;

/* Verbosity level for vectorizer diagnostics.  Initialized to the
   invalid value MAX_VERBOSITY_LEVEL to mark that it's uninitialized.  */
enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;

/* Source location of the loop currently being processed (used for
   diagnostics).  */
static LOC vect_loop_location;

/* Bitmap of virtual variables (memory symbols) to be renamed.  Filled
   during peeling, consumed when SSA form is updated later.  */
bitmap vect_memsyms_to_rename;
165 \f
166 /*************************************************************************
167 Simple Loop Peeling Utilities
168
169 Utilities to support loop peeling for vectorization purposes.
170 *************************************************************************/
171
172
173 /* Renames the use *OP_P. */
174
175 static void
176 rename_use_op (use_operand_p op_p)
177 {
178 tree new_name;
179
180 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
181 return;
182
183 new_name = get_current_def (USE_FROM_PTR (op_p));
184
185 /* Something defined outside of the loop. */
186 if (!new_name)
187 return;
188
189 /* An ordinary ssa name defined in the loop. */
190
191 SET_USE (op_p, new_name);
192 }
193
194
195 /* Renames the variables in basic block BB. */
196
197 static void
198 rename_variables_in_bb (basic_block bb)
199 {
200 tree phi;
201 block_stmt_iterator bsi;
202 tree stmt;
203 use_operand_p use_p;
204 ssa_op_iter iter;
205 edge e;
206 edge_iterator ei;
207 struct loop *loop = bb->loop_father;
208
209 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
210 {
211 stmt = bsi_stmt (bsi);
212 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
213 rename_use_op (use_p);
214 }
215
216 FOR_EACH_EDGE (e, ei, bb->succs)
217 {
218 if (!flow_bb_inside_loop_p (loop, e->dest))
219 continue;
220 for (phi = phi_nodes (e->dest); phi; phi = PHI_CHAIN (phi))
221 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi, e));
222 }
223 }
224
225
226 /* Renames variables in new generated LOOP. */
227
228 void
229 rename_variables_in_loop (struct loop *loop)
230 {
231 unsigned i;
232 basic_block *bbs;
233
234 bbs = get_loop_body (loop);
235
236 for (i = 0; i < loop->num_nodes; i++)
237 rename_variables_in_bb (bbs[i]);
238
239 free (bbs);
240 }
241
242
243 /* Update the PHI nodes of NEW_LOOP.
244
245 NEW_LOOP is a duplicate of ORIG_LOOP.
246 AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
247 AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
248 executes before it. */
249
static void
slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
				       struct loop *new_loop, bool after)
{
  tree new_ssa_name;
  tree phi_new, phi_orig;
  tree def;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_exit_e = single_exit (new_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);
  /* The edge of ORIG_LOOP that carries the values reaching NEW_LOOP's
     entry: the latch if NEW_LOOP runs after ORIG_LOOP, the preheader
     otherwise.  */
  edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);

  /*
     step 1. For each loop-header-phi:
             Add the first phi argument for the phi in NEW_LOOP
            (the one associated with the entry of NEW_LOOP)

     step 2. For each loop-header-phi:
             Add the second phi argument for the phi in NEW_LOOP
            (the one associated with the latch of NEW_LOOP)

     step 3. Update the phis in the successor block of NEW_LOOP.

        case 1: NEW_LOOP was placed before ORIG_LOOP:
                The successor block of NEW_LOOP is the header of ORIG_LOOP.
                Updating the phis in the successor block can therefore be done
                along with the scanning of the loop header phis, because the
                header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
                phi nodes, organized in the same order.

        case 2: NEW_LOOP was placed after ORIG_LOOP:
                The successor block of NEW_LOOP is the original exit block of
                ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
                We postpone updating these phis to a later stage (when
                loop guards are added).
   */


  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order, so they can be
     walked in lockstep).  */

  for (phi_new = phi_nodes (new_loop->header),
       phi_orig = phi_nodes (orig_loop->header);
       phi_new && phi_orig;
       phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
    {
      /* step 1.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
      add_phi_arg (phi_new, def, new_loop_entry_e);

      /* step 2.  Constants need no renaming, and steps 2/3 only make
	 sense for SSA names.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
      if (TREE_CODE (def) != SSA_NAME)
        continue;

      new_ssa_name = get_current_def (def);
      if (!new_ssa_name)
        {
          /* This only happens if there are no definitions
             inside the loop.  Use the phi_result in this case.  */
          new_ssa_name = PHI_RESULT (phi_new);
        }

      /* An ordinary ssa name defined in the loop.  */
      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));

      /* step 3 (case 1).  When NEW_LOOP runs first, its exit edge is
	 ORIG_LOOP's entry edge, and ORIG_LOOP's header phis must now
	 receive the value computed by NEW_LOOP.  */
      if (!after)
        {
          gcc_assert (new_loop_exit_e == orig_entry_e);
          SET_PHI_ARG_DEF (phi_orig,
                           new_loop_exit_e->dest_idx,
                           new_ssa_name);
        }
    }
}
327
328
329 /* Update PHI nodes for a guard of the LOOP.
330
331 Input:
332 - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
333 controls whether LOOP is to be executed. GUARD_EDGE is the edge that
334 originates from the guard-bb, skips LOOP and reaches the (unique) exit
335 bb of LOOP. This loop-exit-bb is an empty bb with one successor.
336 We denote this bb NEW_MERGE_BB because before the guard code was added
337 it had a single predecessor (the LOOP header), and now it became a merge
338 point of two paths - the path that ends with the LOOP exit-edge, and
339 the path that ends with GUARD_EDGE.
340 - NEW_EXIT_BB: New basic block that is added by this function between LOOP
341 and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
342
343 ===> The CFG before the guard-code was added:
344 LOOP_header_bb:
345 loop_body
346 if (exit_loop) goto update_bb
347 else goto LOOP_header_bb
348 update_bb:
349
350 ==> The CFG after the guard-code was added:
351 guard_bb:
352 if (LOOP_guard_condition) goto new_merge_bb
353 else goto LOOP_header_bb
354 LOOP_header_bb:
355 loop_body
356 if (exit_loop_condition) goto new_merge_bb
357 else goto LOOP_header_bb
358 new_merge_bb:
359 goto update_bb
360 update_bb:
361
362 ==> The CFG after this function:
363 guard_bb:
364 if (LOOP_guard_condition) goto new_merge_bb
365 else goto LOOP_header_bb
366 LOOP_header_bb:
367 loop_body
368 if (exit_loop_condition) goto new_exit_bb
369 else goto LOOP_header_bb
370 new_exit_bb:
371 new_merge_bb:
372 goto update_bb
373 update_bb:
374
375 This function:
376 1. creates and updates the relevant phi nodes to account for the new
377 incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
378 1.1. Create phi nodes at NEW_MERGE_BB.
379 1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
380 UPDATE_BB). UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
381 2. preserves loop-closed-ssa-form by creating the required phi nodes
382 at the exit of LOOP (i.e, in NEW_EXIT_BB).
383
384 There are two flavors to this function:
385
386 slpeel_update_phi_nodes_for_guard1:
387 Here the guard controls whether we enter or skip LOOP, where LOOP is a
388 prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
389 for variables that have phis in the loop header.
390
391 slpeel_update_phi_nodes_for_guard2:
392 Here the guard controls whether we enter or skip LOOP, where LOOP is an
393 epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
394 for variables that have phis in the loop exit.
395
396 I.E., the overall structure is:
397
398 loop1_preheader_bb:
399 guard1 (goto loop1/merg1_bb)
400 loop1
401 loop1_exit_bb:
402 guard2 (goto merge1_bb/merge2_bb)
403 merge1_bb
404 loop2
405 loop2_exit_bb
406 merge2_bb
407 next_bb
408
409 slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
410 loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
411 that have phis in loop1->header).
412
413 slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
414 loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
415 that have phis in next_bb). It also adds some of these phis to
416 loop1_exit_bb.
417
418 slpeel_update_phi_nodes_for_guard1 is always called before
419 slpeel_update_phi_nodes_for_guard2. They are both needed in order
420 to create correct data-flow and loop-closed-ssa-form.
421
422 Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
423 that change between iterations of a loop (and therefore have a phi-node
424 at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
425 phis for variables that are used out of the loop (and therefore have
426 loop-closed exit phis). Some variables may be both updated between
427 iterations and used after the loop. This is why in loop1_exit_bb we
428 may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
429 and exit phis (created by slpeel_update_phi_nodes_for_guard2).
430
431 - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
432 an original loop. i.e., we have:
433
434 orig_loop
435 guard_bb (goto LOOP/new_merge)
436 new_loop <-- LOOP
437 new_exit
438 new_merge
439 next_bb
440
441 If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
442 have:
443
444 new_loop
445 guard_bb (goto LOOP/new_merge)
446 orig_loop <-- LOOP
447 new_exit
448 new_merge
449 next_bb
450
451    The SSA names defined in the original loop have a current
452    reaching definition that records the corresponding new
453    ssa-name used in the new duplicated loop copy.
454 */
455
456 /* Function slpeel_update_phi_nodes_for_guard1
457
458 Input:
459 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
460 - DEFS - a bitmap of ssa names to mark new names for which we recorded
461 information.
462
463 In the context of the overall structure, we have:
464
465 loop1_preheader_bb:
466 guard1 (goto loop1/merg1_bb)
467 LOOP-> loop1
468 loop1_exit_bb:
469 guard2 (goto merge1_bb/merge2_bb)
470 merge1_bb
471 loop2
472 loop2_exit_bb
473 merge2_bb
474 next_bb
475
476 For each name updated between loop iterations (i.e - for each name that has
477 an entry (loop-header) phi in LOOP) we create a new phi in:
478 1. merge1_bb (to account for the edge from guard1)
479 2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
480 */
481
static void
slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb,
                                    bitmap *defs)
{
  tree orig_phi, new_phi;
  tree update_phi, update_phi2;
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
  edge e = EDGE_SUCC (new_merge_bb, 0);
  basic_block update_bb = e->dest;
  basic_block orig_bb = loop->header;
  edge new_exit_e;
  tree current_new_name;
  tree name;

  /* Create new bb between loop and new_merge_bb.  */
  *new_exit_bb = split_edge (single_exit (loop));

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);

  /* The header phis of LOOP and the phis of UPDATE_BB are organized in
     exactly the same order, so walk them in lockstep.  */
  for (orig_phi = phi_nodes (orig_bb), update_phi = phi_nodes (update_bb);
       orig_phi && update_phi;
       orig_phi = PHI_CHAIN (orig_phi), update_phi = PHI_CHAIN (update_phi))
    {
      /* Virtual phi; Mark it for renaming.  We actually want to call
	 mark_sym_for_renaming, but since all ssa renaming datastructures
	 are going to be freed before we get to call ssa_update, we just
	 record this name for now in a bitmap, and will mark it for
	 renaming later.  */
      name = PHI_RESULT (orig_phi);
      if (!is_gimple_reg (SSA_NAME_VAR (name)))
        bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));

      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);

      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
            of LOOP. Set the two phi args in NEW_PHI for these edges:  */
      loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
      guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));

      add_phi_arg (new_phi, loop_arg, new_exit_e);
      add_phi_arg (new_phi, guard_arg, guard_edge);

      /* 1.3. Update phi in successor block.  The old argument (which came
	 either around or through the loop) is replaced by the merge phi.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
                  || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

      /* Virtual names were already recorded for renaming above; only real
	 (gimple reg) names need a loop-closed exit phi.  */
      if (!is_gimple_reg (PHI_RESULT (orig_phi)))
	continue;

      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
      add_phi_arg (new_phi, loop_arg, single_exit (loop));

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));

      /* 2.4. Record the newly created name with set_current_def.
         We want to find a name such that
                name = get_current_def (orig_loop_name)
         and to set its current definition as follows:
                set_current_def (name, new_phi_name)

         If LOOP is a new loop then loop_arg is already the name we're
         looking for. If LOOP is the original loop, then loop_arg is
         the orig_loop_name and the relevant name is recorded in its
         current reaching definition.  */
      if (is_new_loop)
        current_new_name = loop_arg;
      else
        {
          current_new_name = get_current_def (loop_arg);
	  /* current_def is not available only if the variable does not
	     change inside the loop, in which case we also don't care
	     about recording a current_def for it because we won't be
	     trying to create loop-exit-phis for it.  */
	  if (!current_new_name)
	    continue;
        }
      gcc_assert (get_current_def (current_new_name) == NULL_TREE);

      set_current_def (current_new_name, PHI_RESULT (new_phi));
      bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
    }

  /* create_phi_node prepends; restore source order of the merge phis so
     they keep matching the header phis positionally.  */
  set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
}
583
584
585 /* Function slpeel_update_phi_nodes_for_guard2
586
587 Input:
588 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
589
590 In the context of the overall structure, we have:
591
592 loop1_preheader_bb:
593 guard1 (goto loop1/merg1_bb)
594 loop1
595 loop1_exit_bb:
596 guard2 (goto merge1_bb/merge2_bb)
597 merge1_bb
598 LOOP-> loop2
599 loop2_exit_bb
600 merge2_bb
601 next_bb
602
603    For each name used outside the loop (i.e - for each name that has an exit
604    phi in next_bb) we create a new phi in:
605    1. merge2_bb (to account for the edge from guard_bb)
606    2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
607    3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
608    if needed (if it wasn't handled by slpeel_update_phi_nodes_for_guard1).
609 */
610
static void
slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb)
{
  tree orig_phi, new_phi;
  tree update_phi, update_phi2;
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
  edge e = EDGE_SUCC (new_merge_bb, 0);
  basic_block update_bb = e->dest;
  edge new_exit_e;
  tree orig_def, orig_def_new_name;
  tree new_name, new_name2;
  tree arg;

  /* Create new bb between loop and new_merge_bb.  */
  *new_exit_bb = split_edge (single_exit (loop));

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);

  /* Walk the loop-closed exit phis in UPDATE_BB; each one stands for a
     name used after the loop.  */
  for (update_phi = phi_nodes (update_bb); update_phi;
       update_phi = PHI_CHAIN (update_phi))
    {
      orig_phi = update_phi;
      orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      /* This loop-closed-phi actually doesn't represent a use
         out of the loop - the phi arg is a constant.  */
      if (TREE_CODE (orig_def) != SSA_NAME)
        continue;
      orig_def_new_name = get_current_def (orig_def);
      arg = NULL_TREE;

      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);

      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
            of LOOP. Set the two PHI args in NEW_PHI for these edges:  */
      new_name = orig_def;
      new_name2 = NULL_TREE;
      if (orig_def_new_name)
        {
          new_name = orig_def_new_name;
	  /* Some variables have both loop-entry-phis and loop-exit-phis.
	     Such variables were given yet newer names by phis placed in
	     guard_bb by slpeel_update_phi_nodes_for_guard1.  I.e:
	     new_name2 = get_current_def (get_current_def (orig_name)).  */
          new_name2 = get_current_def (new_name);
        }
  
      /* Depending on which loop is the copy, ORIG_DEF belongs to either
	 the guarded loop or the loop that was skipped.  */
      if (is_new_loop)
        {
          guard_arg = orig_def;
          loop_arg = new_name;
        }
      else
        {
          guard_arg = new_name;
          loop_arg = orig_def;
        }
      if (new_name2)
        guard_arg = new_name2;
  
      add_phi_arg (new_phi, loop_arg, new_exit_e);
      add_phi_arg (new_phi, guard_arg, guard_edge);

      /* 1.3. Update phi in successor block.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
      add_phi_arg (new_phi, loop_arg, single_exit (loop));

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));


      /** 3. Handle loop-closed-ssa-form phis for first loop  **/

      /* 3.1. Find the relevant names that need an exit-phi in
	 GUARD_BB, i.e. names for which
	 slpeel_update_phi_nodes_for_guard1 had not already created a
	 phi node. This is the case for names that are used outside
	 the loop (and therefore need an exit phi) but are not updated
	 across loop iterations (and therefore don't have a
	 loop-header-phi).

	 slpeel_update_phi_nodes_for_guard1 is responsible for
	 creating loop-exit phis in GUARD_BB for names that have a
	 loop-header-phi.  When such a phi is created we also record
	 the new name in its current definition.  If this new name
	 exists, then guard_arg was set to this new name (see 1.2
	 above).  Therefore, if guard_arg is not this new name, this
	 is an indication that an exit-phi in GUARD_BB was not yet
	 created, so we take care of it here.  */
      if (guard_arg == new_name2)
	continue;
      arg = guard_arg;

      /* 3.2. Generate new phi node in GUARD_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 guard_edge->src);

      /* 3.3. GUARD_BB has one incoming edge:  */
      gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
      add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));

      /* 3.4. Update phi in successor of GUARD_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
                                                                == guard_arg);
      SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
    }

  /* create_phi_node prepends; restore source order of the merge phis.  */
  set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
}
737
738
739 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
740 that starts at zero, increases by one and its limit is NITERS.
741
742 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
743
744 void
745 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
746 {
747 tree indx_before_incr, indx_after_incr, cond_stmt, cond;
748 tree orig_cond;
749 edge exit_edge = single_exit (loop);
750 block_stmt_iterator loop_cond_bsi;
751 block_stmt_iterator incr_bsi;
752 bool insert_after;
753 tree init = build_int_cst (TREE_TYPE (niters), 0);
754 tree step = build_int_cst (TREE_TYPE (niters), 1);
755 LOC loop_loc;
756
757 orig_cond = get_loop_exit_condition (loop);
758 gcc_assert (orig_cond);
759 loop_cond_bsi = bsi_for_stmt (orig_cond);
760
761 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
762 create_iv (init, step, NULL_TREE, loop,
763 &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr);
764
765 if (exit_edge->flags & EDGE_TRUE_VALUE) /* 'then' edge exits the loop. */
766 cond = build2 (GE_EXPR, boolean_type_node, indx_after_incr, niters);
767 else /* 'then' edge loops back. */
768 cond = build2 (LT_EXPR, boolean_type_node, indx_after_incr, niters);
769
770 cond_stmt = build3 (COND_EXPR, TREE_TYPE (orig_cond), cond,
771 NULL_TREE, NULL_TREE);
772 bsi_insert_before (&loop_cond_bsi, cond_stmt, BSI_SAME_STMT);
773
774 /* Remove old loop exit test: */
775 bsi_remove (&loop_cond_bsi, true);
776
777 loop_loc = find_loop_location (loop);
778 if (dump_file && (dump_flags & TDF_DETAILS))
779 {
780 if (loop_loc != UNKNOWN_LOC)
781 fprintf (dump_file, "\nloop at %s:%d: ",
782 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
783 print_generic_expr (dump_file, cond_stmt, TDF_SLIM);
784 }
785
786 loop->nb_iterations = niters;
787 }
788
789
790 /* Given LOOP this function generates a new copy of it and puts it
791 on E which is either the entry or exit of LOOP. */
792
struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  tree phi, phi_arg;
  edge exit, new_exit;

  /* E must be either the (single) exit edge or the preheader edge of
     LOOP; anything else is rejected.  */
  at_exit = (e == single_exit (loop));
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  bbs = get_loop_body (loop);

  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (bbs, loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (loop, loop_outer (loop));
  if (!new_loop)
    {
      free (bbs);
      return NULL;
    }

  exit_dest = single_exit (loop)->dest;
  /* Remember whether LOOP's header immediately dominated the exit
     destination, so that dominance can be transferred to the copy's
     header after redirection (see the at_exit branch below).  */
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
					  exit_dest) == loop->header ?
		 true : false);

  new_bbs = XNEWVEC (basic_block, loop->num_nodes);

  exit = single_exit (loop);
  copy_bbs (bbs, loop->num_nodes, new_bbs,
	    &exit, 1, &new_exit, NULL,
	    e->src);

  /* Duplicating phi args at exit bbs as coming
     also from exit of duplicated loop.  */
  for (phi = phi_nodes (exit_dest); phi; phi = PHI_CHAIN (phi))
    {
      phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
      if (phi_arg)
	{
	  edge new_loop_exit_edge;

	  /* The copied header has two successors; the one that does not
	     go to the latch is the exit edge of the copy.  */
	  if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
	    new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
	  else
	    new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);

	  add_phi_arg (phi, phi_arg, new_loop_exit_edge);
	}
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      redirect_edge_and_branch_force (e, new_loop->header);
      PENDING_STMT (e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
      if (was_imm_dom)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
    }
  else /* Add the copy at entry.  */
    {
      edge new_exit_e;
      edge entry_e = loop_preheader_edge (loop);
      basic_block preheader = entry_e->src;

      /* Find the exit edge of the copy (the header successor that leaves
	 the copied loop).  */
      if (!flow_bb_inside_loop_p (new_loop,
				  EDGE_SUCC (new_loop->header, 0)->dest))
	new_exit_e = EDGE_SUCC (new_loop->header, 0);
      else
	new_exit_e = EDGE_SUCC (new_loop->header, 1);

      redirect_edge_and_branch_force (new_exit_e, loop->header);
      PENDING_STMT (new_exit_e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, loop->header,
			       new_exit_e->src);

      /* We have to add phi args to the loop->header here as coming
	 from new_exit_e edge.  */
      for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
	{
	  phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
	  if (phi_arg)
	    add_phi_arg (phi, phi_arg, new_exit_e);
	}

      redirect_edge_and_branch_force (entry_e, new_loop->header);
      PENDING_STMT (entry_e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
    }

  free (new_bbs);
  free (bbs);

  return new_loop;
}
899
900
/* Given the condition statement COND, put it as the last statement
   of GUARD_BB; EXIT_BB is the basic block to skip the loop;
   DOM_BB is the immediate dominator to assign to EXIT_BB.
   Assumes that this is the single exit of the guarded loop.
   Returns the skip edge (the new GUARD_BB -> EXIT_BB edge).  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
		       basic_block dom_bb)
{
  block_stmt_iterator bsi;
  edge new_e, enter_e;
  tree cond_stmt;
  tree gimplify_stmt_list;

  /* The existing fallthrough edge out of GUARD_BB becomes the "false"
     (enter-the-loop) edge of the new COND_EXPR.  */
  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  bsi = bsi_last (guard_bb);

  /* COND may not be in GIMPLE form yet; gimplify it and keep any
     statements this produces so they can be emitted before the COND_EXPR.  */
  cond =
    force_gimple_operand (cond, &gimplify_stmt_list, true,
                          NULL_TREE);
  cond_stmt = build3 (COND_EXPR, void_type_node, cond,
		      NULL_TREE, NULL_TREE);
  if (gimplify_stmt_list)
    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);

  /* Re-fetch the iterator: the insertion above may have moved the last
     statement of GUARD_BB.  The COND_EXPR must be the very last stmt.  */
  bsi = bsi_last (guard_bb);
  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
  return new_e;
}
936
937
938 /* This function verifies that the following restrictions apply to LOOP:
939 (1) it is innermost
940 (2) it consists of exactly 2 basic blocks - header, and an empty latch.
941 (3) it is single entry, single exit
942 (4) its exit condition is the last stmt in the header
943 (5) E is the entry/exit edge of LOOP.
944 */
945
946 bool
947 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
948 {
949 edge exit_e = single_exit (loop);
950 edge entry_e = loop_preheader_edge (loop);
951 tree orig_cond = get_loop_exit_condition (loop);
952 block_stmt_iterator loop_exit_bsi = bsi_last (exit_e->src);
953
954 if (need_ssa_update_p ())
955 return false;
956
957 if (loop->inner
958 /* All loops have an outer scope; the only case loop->outer is NULL is for
959 the function itself. */
960 || !loop_outer (loop)
961 || loop->num_nodes != 2
962 || !empty_block_p (loop->latch)
963 || !single_exit (loop)
964 /* Verify that new loop exit condition can be trivially modified. */
965 || (!orig_cond || orig_cond != bsi_stmt (loop_exit_bsi))
966 || (e != exit_e && e != entry_e))
967 return false;
968
969 return true;
970 }
971
#ifdef ENABLE_CHECKING
/* Sanity-check the CFG produced by slpeel_tree_peel_loop_to_edge:
   FIRST_LOOP and SECOND_LOOP are the two loops left after peeling.  */
void
slpeel_verify_cfg_after_peeling (struct loop *first_loop,
                                 struct loop *second_loop)
{
  basic_block loop1_exit_bb = single_exit (first_loop)->dest;
  basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
  basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;

  /* A guard that controls whether the second_loop is to be executed or skipped
     is placed in first_loop->exit.  first_loop->exit therefore has two
     successors - one is the preheader of second_loop, and the other is a bb
     after second_loop.
   */
  gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);

  /* 1. Verify that one of the successors of first_loop->exit is the preheader
        of second_loop.  */

  /* The preheader of new_loop is expected to have two predecessors:
     first_loop->exit and the block that precedes first_loop.  */

  gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
              && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
                   && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
                  || (EDGE_PRED (loop2_entry_bb, 1)->src == loop1_exit_bb
                      && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));

  /* Verify that the other successor of first_loop->exit is after the
     second_loop.  */
  /* TODO */
}
#endif
1005
/* If the run time cost model check determines that vectorization is
   not profitable and hence scalar loop should be generated then set
   FIRST_NITERS to prologue peeled iterations.  This will allow all the
   iterations to be executed in the prologue peeled scalar loop.

   Builds, before BB_BEFORE_FIRST_LOOP:

     if (scalar_loop_iterations <= TH)    <-- cond_bb
       tmp = scalar_loop_iterations       <-- then_bb
     phi (tmp, FIRST_NITERS)              <-- in bb_before_first_loop

   NOTE(review): the final "first_niters = PHI_RESULT (newphi)" only
   updates this function's by-value copy of the parameter; the caller's
   FIRST_NITERS tree is unchanged.  Fixing this would require passing
   FIRST_NITERS by address (tree *), which changes the interface —
   confirm against callers before relying on the PHI result here.  */

void
set_prologue_iterations (basic_block bb_before_first_loop,
			 tree first_niters,
			 struct loop *loop,
			 unsigned int th)
{
  edge e;
  basic_block cond_bb, then_bb;
  tree var, prologue_after_cost_adjust_name, stmt;
  block_stmt_iterator bsi;
  tree newphi;
  edge e_true, e_false, e_fallthru;
  tree cond_stmt;
  tree gimplify_stmt_list;
  tree cost_pre_condition = NULL_TREE;
  /* Use an unshared copy of the original (pre-peeling) iteration count.  */
  tree scalar_loop_iters =
    unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));

  /* Split the single predecessor edge twice: the first split block holds
     the cost-model condition, the second becomes the "then" block.  */
  e = single_pred_edge (bb_before_first_loop);
  cond_bb = split_edge(e);

  e = single_pred_edge (bb_before_first_loop);
  then_bb = split_edge(e);
  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);

  /* False edge: condition failed, keep FIRST_NITERS as-is.  */
  e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
				   EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);

  /* Turn the former fallthrough edge into the "true" edge of the guard.  */
  e_true = EDGE_PRED (then_bb, 0);
  e_true->flags &= ~EDGE_FALLTHRU;
  e_true->flags |= EDGE_TRUE_VALUE;

  e_fallthru = EDGE_SUCC (then_bb, 0);

  /* cost_pre_condition: scalar_loop_iters <= TH.  */
  cost_pre_condition =
    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
	    build_int_cst (TREE_TYPE (scalar_loop_iters), th));
  cost_pre_condition =
    force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
			  true, NULL_TREE);
  cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
		      NULL_TREE, NULL_TREE);

  /* Emit any gimplification statements, then the COND_EXPR as the last
     statement of cond_bb (re-fetch the iterator after each insertion).  */
  bsi = bsi_last (cond_bb);
  if (gimplify_stmt_list)
    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);

  bsi = bsi_last (cond_bb);
  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);

  /* In then_bb, copy the unchanged scalar iteration count into a temp.  */
  var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
			"prologue_after_cost_adjust");
  add_referenced_var (var);
  prologue_after_cost_adjust_name =
    force_gimple_operand (scalar_loop_iters, &stmt, false, var);

  bsi = bsi_last (then_bb);
  if (stmt)
    bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);

  /* Merge both values with a PHI in bb_before_first_loop.  */
  newphi = create_phi_node (var, bb_before_first_loop);
  add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
  add_phi_arg (newphi, first_niters, e_false);

  /* See NOTE(review) above: this assignment is local to this function.  */
  first_niters = PHI_RESULT (newphi);
}
1078
1079
/* Function slpeel_tree_peel_loop_to_edge.

   Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
   that is placed on the entry (exit) edge E of LOOP. After this transformation
   we have two loops one after the other - first-loop iterates FIRST_NITERS
   times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
   If the cost model indicates that it is profitable to emit a scalar
   loop instead of the vector one, then the prolog (epilog) loop will iterate
   for the entire unchanged scalar iterations of the loop.

   Input:
   - LOOP: the loop to be peeled.
   - E: the exit or entry edge of LOOP.
        If it is the entry edge, we peel the first iterations of LOOP. In this
        case first-loop is LOOP, and second-loop is the newly created loop.
        If it is the exit edge, we peel the last iterations of LOOP. In this
        case, first-loop is the newly created loop, and second-loop is LOOP.
   - NITERS: the number of iterations that LOOP iterates.
   - FIRST_NITERS: the number of iterations that the first-loop should iterate.
   - UPDATE_FIRST_LOOP_COUNT:  specifies whether this function is responsible
        for updating the loop bound of the first-loop to FIRST_NITERS.  If it
        is false, the caller of this function may want to take care of this
        (this can be useful if we don't want new stmts added to first-loop).
   - TH: cost model profitability threshold of iterations for vectorization.
   - CHECK_PROFITABILITY: specify whether cost model check has not occurred
                          during versioning and hence needs to occur during
			  prologue generation or whether cost model check
			  has not occurred during prologue generation and hence
			  needs to occur during epilogue generation.


   Output:
   The function returns a pointer to the new loop-copy, or NULL if it failed
   to perform the transformation.

   The function generates two if-then-else guards: one before the first loop,
   and the other before the second loop:
   The first guard is:
     if (FIRST_NITERS == 0) then skip the first loop,
     and go directly to the second loop.
   The second guard is:
     if (FIRST_NITERS == NITERS) then skip the second loop.

   FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
   FORNOW the resulting code will not be in loop-closed-ssa form.
*/

struct loop*
slpeel_tree_peel_loop_to_edge (struct loop *loop,
			       edge e, tree first_niters,
			       tree niters, bool update_first_loop_count,
			       unsigned int th, bool check_profitability)
{
  struct loop *new_loop = NULL, *first_loop, *second_loop;
  edge skip_e;
  tree pre_condition = NULL_TREE;
  bitmap definitions;
  basic_block bb_before_second_loop, bb_after_second_loop;
  basic_block bb_before_first_loop;
  basic_block bb_between_loops;
  basic_block new_exit_bb;
  edge exit_e = single_exit (loop);
  LOC loop_loc;
  tree cost_pre_condition = NULL_TREE;

  if (!slpeel_can_duplicate_loop_p (loop, e))
    return NULL;

  /* We have to initialize cfg_hooks. Then, when calling
   cfg_hooks->split_edge, the function tree_split_edge
   is actually called and, when calling cfg_hooks->duplicate_block,
   the function tree_duplicate_bb is called.  */
  tree_register_cfg_hooks ();


  /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
        Resulting CFG would be:

        first_loop:
        do {
        } while ...

        second_loop:
        do {
        } while ...

        orig_exit_bb:
   */

  if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
    {
      loop_loc = find_loop_location (loop);
      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (loop_loc != UNKNOWN_LOC)
            fprintf (dump_file, "\n%s:%d: note: ",
                     LOC_FILE (loop_loc), LOC_LINE (loop_loc));
          fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
        }
      return NULL;
    }

  if (e == exit_e)
    {
      /* NEW_LOOP was placed after LOOP.  */
      first_loop = loop;
      second_loop = new_loop;
    }
  else
    {
      /* NEW_LOOP was placed before LOOP.  */
      first_loop = new_loop;
      second_loop = loop;
    }

  definitions = ssa_names_to_replace ();
  slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
  rename_variables_in_loop (new_loop);


  /* 2.  Add the guard code in one of the following ways:

     2.a Add the guard that controls whether the first loop is executed.
         This occurs when this function is invoked for prologue or epilogue
	 generation and when the cost model check can be done at compile time.

         Resulting CFG would be:

         bb_before_first_loop:
         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:

     2.b Add the cost model check that allows the prologue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one. This occurs when
	 this function is invoked for prologue generation
	 and the cost model check needs to be done at run
	 time.

         Resulting CFG after prologue peeling would be:

         if (scalar_loop_iterations <= th)
           FIRST_NITERS = scalar_loop_iterations

         bb_before_first_loop:
         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:

     2.c Add the cost model check that allows the epilogue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one. This occurs when
	 this function is invoked for epilogue generation
	 and the cost model check needs to be done at run
	 time.

         Resulting CFG after epilogue peeling would be:

         bb_before_first_loop:
         if ((scalar_loop_iterations <= th)
             ||
             FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:
  */

  bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
  bb_before_second_loop = split_edge (single_exit (first_loop));

  /* Epilogue peeling.  */
  if (!update_first_loop_count)
    {
      pre_condition =
	fold_build2 (LE_EXPR, boolean_type_node, first_niters,
		     build_int_cst (TREE_TYPE (first_niters), 0));
      if (check_profitability)
	{
	  tree scalar_loop_iters
	    = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
			    (loop_vec_info_for_loop (loop)));
	  /* Case 2.c: fold the run-time cost check into the guard.  */
	  cost_pre_condition =
	    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
		    build_int_cst (TREE_TYPE (scalar_loop_iters), th));

	  pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
				       cost_pre_condition, pre_condition);
	}
    }

  /* Prologue peeling.  */
  else
    {
      if (check_profitability)
	/* Case 2.b: emit the run-time cost check before the guard.  */
	set_prologue_iterations (bb_before_first_loop, first_niters,
				 loop, th);

      pre_condition =
	fold_build2 (LE_EXPR, boolean_type_node, first_niters,
		     build_int_cst (TREE_TYPE (first_niters), 0));
    }

  skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
                                  bb_before_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
				      first_loop == new_loop,
				      &new_exit_bb, &definitions);


  /* 3. Add the guard that controls whether the second loop is executed.
        Resulting CFG would be:

        bb_before_first_loop:
        if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
                               GOTO first-loop

        first_loop:
        do {
        } while ...

        bb_between_loops:
        if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
                                    GOTO bb_before_second_loop

        bb_before_second_loop:

        second_loop:
        do {
        } while ...

        bb_after_second_loop:

        orig_exit_bb:
   */

  bb_between_loops = new_exit_bb;
  bb_after_second_loop = split_edge (single_exit (second_loop));

  pre_condition =
	fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
  skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
                                  bb_after_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
                                     second_loop == new_loop, &new_exit_bb);

  /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
   */
  if (update_first_loop_count)
    slpeel_make_loop_iterate_ntimes (first_loop, first_niters);

  BITMAP_FREE (definitions);
  delete_update_ssa ();

  return new_loop;
}
1372
1373 /* Function vect_get_loop_location.
1374
1375 Extract the location of the loop in the source code.
1376 If the loop is not well formed for vectorization, an estimated
1377 location is calculated.
1378 Return the loop location if succeed and NULL if not. */
1379
1380 LOC
1381 find_loop_location (struct loop *loop)
1382 {
1383 tree node = NULL_TREE;
1384 basic_block bb;
1385 block_stmt_iterator si;
1386
1387 if (!loop)
1388 return UNKNOWN_LOC;
1389
1390 node = get_loop_exit_condition (loop);
1391
1392 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node)
1393 && EXPR_FILENAME (node) && EXPR_LINENO (node))
1394 return EXPR_LOC (node);
1395
1396 /* If we got here the loop is probably not "well formed",
1397 try to estimate the loop location */
1398
1399 if (!loop->header)
1400 return UNKNOWN_LOC;
1401
1402 bb = loop->header;
1403
1404 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1405 {
1406 node = bsi_stmt (si);
1407 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node))
1408 return EXPR_LOC (node);
1409 }
1410
1411 return UNKNOWN_LOC;
1412 }
1413
1414
1415 /*************************************************************************
1416 Vectorization Debug Information.
1417 *************************************************************************/
1418
1419 /* Function vect_set_verbosity_level.
1420
1421 Called from toplev.c upon detection of the
1422 -ftree-vectorizer-verbose=N option. */
1423
1424 void
1425 vect_set_verbosity_level (const char *val)
1426 {
1427 unsigned int vl;
1428
1429 vl = atoi (val);
1430 if (vl < MAX_VERBOSITY_LEVEL)
1431 vect_verbosity_level = vl;
1432 else
1433 vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1434 }
1435
1436
1437 /* Function vect_set_dump_settings.
1438
1439 Fix the verbosity level of the vectorizer if the
1440 requested level was not set explicitly using the flag
1441 -ftree-vectorizer-verbose=N.
1442 Decide where to print the debugging information (dump_file/stderr).
1443 If the user defined the verbosity level, but there is no dump file,
1444 print to stderr, otherwise print to the dump file. */
1445
1446 static void
1447 vect_set_dump_settings (void)
1448 {
1449 vect_dump = dump_file;
1450
1451 /* Check if the verbosity level was defined by the user: */
1452 if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1453 {
1454 /* If there is no dump file, print to stderr. */
1455 if (!dump_file)
1456 vect_dump = stderr;
1457 return;
1458 }
1459
1460 /* User didn't specify verbosity level: */
1461 if (dump_file && (dump_flags & TDF_DETAILS))
1462 vect_verbosity_level = REPORT_DETAILS;
1463 else if (dump_file && (dump_flags & TDF_STATS))
1464 vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1465 else
1466 vect_verbosity_level = REPORT_NONE;
1467
1468 gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1469 }
1470
1471
/* Function vect_print_dump_info.

   For vectorization debug dumps.  Returns true (after emitting a
   "file:line: note: " prefix to VECT_DUMP) if messages of verbosity
   level VL should be printed; false otherwise.  The location used is
   vect_loop_location when known, else the current function's decl.  */

bool
vect_print_dump_info (enum verbosity_levels vl)
{
  if (vl > vect_verbosity_level)
    return false;

  if (!current_function_decl || !vect_dump)
    return false;

  if (vect_loop_location == UNKNOWN_LOC)
    fprintf (vect_dump, "\n%s:%d: note: ",
	     DECL_SOURCE_FILE (current_function_decl),
	     DECL_SOURCE_LINE (current_function_decl));
  else
    fprintf (vect_dump, "\n%s:%d: note: ",
	     LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));

  return true;
}
1495
1496
1497 /*************************************************************************
1498 Vectorization Utilities.
1499 *************************************************************************/
1500
1501 /* Function new_stmt_vec_info.
1502
1503 Create and initialize a new stmt_vec_info struct for STMT. */
1504
1505 stmt_vec_info
1506 new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
1507 {
1508 stmt_vec_info res;
1509 res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));
1510
1511 STMT_VINFO_TYPE (res) = undef_vec_info_type;
1512 STMT_VINFO_STMT (res) = stmt;
1513 STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
1514 STMT_VINFO_RELEVANT (res) = 0;
1515 STMT_VINFO_LIVE_P (res) = false;
1516 STMT_VINFO_VECTYPE (res) = NULL;
1517 STMT_VINFO_VEC_STMT (res) = NULL;
1518 STMT_VINFO_IN_PATTERN_P (res) = false;
1519 STMT_VINFO_RELATED_STMT (res) = NULL;
1520 STMT_VINFO_DATA_REF (res) = NULL;
1521
1522 STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
1523 STMT_VINFO_DR_OFFSET (res) = NULL;
1524 STMT_VINFO_DR_INIT (res) = NULL;
1525 STMT_VINFO_DR_STEP (res) = NULL;
1526 STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
1527
1528 if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
1529 STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
1530 else
1531 STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1532 STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1533 STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
1534 STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1535 STMT_SLP_TYPE (res) = 0;
1536 DR_GROUP_FIRST_DR (res) = NULL_TREE;
1537 DR_GROUP_NEXT_DR (res) = NULL_TREE;
1538 DR_GROUP_SIZE (res) = 0;
1539 DR_GROUP_STORE_COUNT (res) = 0;
1540 DR_GROUP_GAP (res) = 0;
1541 DR_GROUP_SAME_DR_STMT (res) = NULL_TREE;
1542 DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1543
1544 return res;
1545 }
1546
1547
1548 /* Function bb_in_loop_p
1549
1550 Used as predicate for dfs order traversal of the loop bbs. */
1551
1552 static bool
1553 bb_in_loop_p (const_basic_block bb, const void *data)
1554 {
1555 const struct loop *const loop = (const struct loop *)data;
1556 if (flow_bb_inside_loop_p (loop, bb))
1557 return true;
1558 return false;
1559 }
1560
1561
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  Stmts belonging to
   an already-analyzed inner loop keep their existing stmt_vec_info but
   are re-pointed at the new loop_vec_info.  */

loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  block_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      tree phi;

      /* BBs in a nested inner-loop will have been already processed (because
	 we will have called vect_analyze_loop_form for any nested inner-loop).
	 Therefore, for stmts in an inner-loop we just want to update the
	 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
	 loop_info of the outer-loop we are currently considering to vectorize
	 (instead of the loop_info of the inner-loop).
	 For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
	{
	  /* Inner-loop bb.  */
	  gcc_assert (loop->inner && bb->loop_father == loop->inner);
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_vec_info stmt_info = vinfo_for_stmt (phi);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      /* The existing info must belong to the inner loop.  */
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	    }
	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	   {
	      tree stmt = bsi_stmt (si);
	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	   }
	}
      else
	{
	  /* bb in current nest.  */
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_ann_t ann = get_stmt_ann (phi);
	      set_stmt_info (ann, new_stmt_vec_info (phi, res));
	    }

	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	    {
	      tree stmt = bsi_stmt (si);
	      stmt_ann_t ann = stmt_ann (stmt);
	      set_stmt_info (ann, new_stmt_vec_info (stmt, res));
	    }
	}
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  /* Replace the get_loop_body ordering with a dfs enumeration.  */
  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
			     bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Initialize the remaining loop_vec_info fields (xcalloc already
     zeroed the struct; these make the defaults explicit).  */
  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}
1662
1663
1664 /* Function destroy_loop_vec_info.
1665
1666 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1667 stmts in the loop. */
1668
1669 void
1670 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1671 {
1672 struct loop *loop;
1673 basic_block *bbs;
1674 int nbbs;
1675 block_stmt_iterator si;
1676 int j;
1677 VEC (slp_instance, heap) *slp_instances;
1678 slp_instance instance;
1679
1680 if (!loop_vinfo)
1681 return;
1682
1683 loop = LOOP_VINFO_LOOP (loop_vinfo);
1684
1685 bbs = LOOP_VINFO_BBS (loop_vinfo);
1686 nbbs = loop->num_nodes;
1687
1688 if (!clean_stmts)
1689 {
1690 free (LOOP_VINFO_BBS (loop_vinfo));
1691 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1692 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1693 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1694
1695 free (loop_vinfo);
1696 loop->aux = NULL;
1697 return;
1698 }
1699
1700 for (j = 0; j < nbbs; j++)
1701 {
1702 basic_block bb = bbs[j];
1703 tree phi;
1704 stmt_vec_info stmt_info;
1705
1706 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1707 {
1708 stmt_ann_t ann = stmt_ann (phi);
1709
1710 stmt_info = vinfo_for_stmt (phi);
1711 free (stmt_info);
1712 set_stmt_info (ann, NULL);
1713 }
1714
1715 for (si = bsi_start (bb); !bsi_end_p (si); )
1716 {
1717 tree stmt = bsi_stmt (si);
1718 stmt_ann_t ann = stmt_ann (stmt);
1719 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1720
1721 if (stmt_info)
1722 {
1723 /* Check if this is a "pattern stmt" (introduced by the
1724 vectorizer during the pattern recognition pass). */
1725 bool remove_stmt_p = false;
1726 tree orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1727 if (orig_stmt)
1728 {
1729 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
1730 if (orig_stmt_info
1731 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
1732 remove_stmt_p = true;
1733 }
1734
1735 /* Free stmt_vec_info. */
1736 VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1737 free (stmt_info);
1738 set_stmt_info (ann, NULL);
1739
1740 /* Remove dead "pattern stmts". */
1741 if (remove_stmt_p)
1742 bsi_remove (&si, true);
1743 }
1744 bsi_next (&si);
1745 }
1746 }
1747
1748 free (LOOP_VINFO_BBS (loop_vinfo));
1749 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1750 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1751 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1752 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1753 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1754 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
1755 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
1756 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1757
1758 free (loop_vinfo);
1759 loop->aux = NULL;
1760 }
1761
1762
1763 /* Function vect_force_dr_alignment_p.
1764
1765 Returns whether the alignment of a DECL can be forced to be aligned
1766 on ALIGNMENT bit boundary. */
1767
1768 bool
1769 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1770 {
1771 if (TREE_CODE (decl) != VAR_DECL)
1772 return false;
1773
1774 if (DECL_EXTERNAL (decl))
1775 return false;
1776
1777 if (TREE_ASM_WRITTEN (decl))
1778 return false;
1779
1780 if (TREE_STATIC (decl))
1781 return (alignment <= MAX_OFILE_ALIGNMENT);
1782 else
1783 /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
1784 correct until someone implements forced stack alignment. */
1785 return (alignment <= STACK_BOUNDARY);
1786 }
1787
1788
1789 /* Function get_vectype_for_scalar_type.
1790
1791 Returns the vector type corresponding to SCALAR_TYPE as supported
1792 by the target. */
1793
1794 tree
1795 get_vectype_for_scalar_type (tree scalar_type)
1796 {
1797 enum machine_mode inner_mode = TYPE_MODE (scalar_type);
1798 int nbytes = GET_MODE_SIZE (inner_mode);
1799 int nunits;
1800 tree vectype;
1801
1802 if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD)
1803 return NULL_TREE;
1804
1805 /* FORNOW: Only a single vector size per target (UNITS_PER_SIMD_WORD)
1806 is expected. */
1807 nunits = UNITS_PER_SIMD_WORD / nbytes;
1808
1809 vectype = build_vector_type (scalar_type, nunits);
1810 if (vect_print_dump_info (REPORT_DETAILS))
1811 {
1812 fprintf (vect_dump, "get vectype with %d units of type ", nunits);
1813 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1814 }
1815
1816 if (!vectype)
1817 return NULL_TREE;
1818
1819 if (vect_print_dump_info (REPORT_DETAILS))
1820 {
1821 fprintf (vect_dump, "vectype: ");
1822 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1823 }
1824
1825 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1826 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1827 {
1828 if (vect_print_dump_info (REPORT_DETAILS))
1829 fprintf (vect_dump, "mode not supported by target.");
1830 return NULL_TREE;
1831 }
1832
1833 return vectype;
1834 }
1835
1836
/* Function vect_supportable_dr_alignment

   Return whether the data reference DR is supported with respect to its
   alignment: dr_aligned, one of the realignment schemes, plain unaligned
   support, or dr_unaligned_unsupported.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr)
{
  tree stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  enum machine_mode mode = (int) TYPE_MODE (vectype);
  struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
  bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
  bool invariant_in_outerloop = false;

  if (aligned_access_p (dr))
    return dr_aligned;

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      /* A zero step w.r.t. the outer loop means the address is
	 invariant there.  */
      invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD). There are two variants to the explicit
     realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that is will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      /* Prefer explicit realignment when the target implements
	 vec_realign_load and (if it defines one) provides a mask.  */
      if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
						   	     CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  /* In the nested case the optimized scheme is only valid when the
	     inner-loop step equals the vector size (see comment above).  */
	  if (nested_in_vect_loop
	      && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}

      if (optab_handler (movmisalign_optab, mode)->insn_code != 
					     CODE_FOR_nothing)
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
1949
1950
1951 /* Function vect_is_simple_use.
1952
1953 Input:
1954 LOOP - the loop that is being vectorized.
1955 OPERAND - operand of a stmt in LOOP.
1956 DEF - the defining stmt in case OPERAND is an SSA_NAME.
1957
1958 Returns whether a stmt with OPERAND can be vectorized.
1959 Supportable operands are constants, loop invariants, and operands that are
1960 defined by the current iteration of the loop. Unsupportable operands are
1961 those that are defined by a previous iteration of the loop (as is the case
1962 in reduction/induction computations). */
1963
1964 bool
1965 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
1966 tree *def, enum vect_def_type *dt)
1967 {
1968 basic_block bb;
1969 stmt_vec_info stmt_vinfo;
1970 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971
1972 *def_stmt = NULL_TREE;
1973 *def = NULL_TREE;
1974
1975 if (vect_print_dump_info (REPORT_DETAILS))
1976 {
1977 fprintf (vect_dump, "vect_is_simple_use: operand ");
1978 print_generic_expr (vect_dump, operand, TDF_SLIM);
1979 }
1980
1981 if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
1982 {
1983 *dt = vect_constant_def;
1984 return true;
1985 }
1986 if (is_gimple_min_invariant (operand))
1987 {
1988 *def = operand;
1989 *dt = vect_invariant_def;
1990 return true;
1991 }
1992
1993 if (TREE_CODE (operand) == PAREN_EXPR)
1994 {
1995 if (vect_print_dump_info (REPORT_DETAILS))
1996 fprintf (vect_dump, "non-associatable copy.");
1997 operand = TREE_OPERAND (operand, 0);
1998 }
1999 if (TREE_CODE (operand) != SSA_NAME)
2000 {
2001 if (vect_print_dump_info (REPORT_DETAILS))
2002 fprintf (vect_dump, "not ssa-name.");
2003 return false;
2004 }
2005
2006 *def_stmt = SSA_NAME_DEF_STMT (operand);
2007 if (*def_stmt == NULL_TREE )
2008 {
2009 if (vect_print_dump_info (REPORT_DETAILS))
2010 fprintf (vect_dump, "no def_stmt.");
2011 return false;
2012 }
2013
2014 if (vect_print_dump_info (REPORT_DETAILS))
2015 {
2016 fprintf (vect_dump, "def_stmt: ");
2017 print_generic_expr (vect_dump, *def_stmt, TDF_SLIM);
2018 }
2019
2020 /* empty stmt is expected only in case of a function argument.
2021 (Otherwise - we expect a phi_node or a GIMPLE_MODIFY_STMT). */
2022 if (IS_EMPTY_STMT (*def_stmt))
2023 {
2024 tree arg = TREE_OPERAND (*def_stmt, 0);
2025 if (is_gimple_min_invariant (arg))
2026 {
2027 *def = operand;
2028 *dt = vect_invariant_def;
2029 return true;
2030 }
2031
2032 if (vect_print_dump_info (REPORT_DETAILS))
2033 fprintf (vect_dump, "Unexpected empty stmt.");
2034 return false;
2035 }
2036
2037 bb = bb_for_stmt (*def_stmt);
2038 if (!flow_bb_inside_loop_p (loop, bb))
2039 *dt = vect_invariant_def;
2040 else
2041 {
2042 stmt_vinfo = vinfo_for_stmt (*def_stmt);
2043 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
2044 }
2045
2046 if (*dt == vect_unknown_def_type)
2047 {
2048 if (vect_print_dump_info (REPORT_DETAILS))
2049 fprintf (vect_dump, "Unsupported pattern.");
2050 return false;
2051 }
2052
2053 if (vect_print_dump_info (REPORT_DETAILS))
2054 fprintf (vect_dump, "type of def: %d.",*dt);
2055
2056 switch (TREE_CODE (*def_stmt))
2057 {
2058 case PHI_NODE:
2059 *def = PHI_RESULT (*def_stmt);
2060 break;
2061
2062 case GIMPLE_MODIFY_STMT:
2063 *def = GIMPLE_STMT_OPERAND (*def_stmt, 0);
2064 break;
2065
2066 default:
2067 if (vect_print_dump_info (REPORT_DETAILS))
2068 fprintf (vect_dump, "unsupported defining stmt: ");
2069 return false;
2070 }
2071
2072 return true;
2073 }
2074
2075
/* Function supportable_widening_operation

   Check whether an operation represented by the code CODE is a
   widening operation that is supported by the target platform in
   vector form (i.e., when operating on arguments of type VECTYPE).

   Widening operations we currently support are NOP (CONVERT), FLOAT
   and WIDEN_MULT. This function checks if these operations are supported
   by the target platform either directly (via vector tree-codes), or via
   target builtins.

   Output:
   - CODE1 and CODE2 are codes of vector operations to be used when
   vectorizing the operation, if available.
   - DECL1 and DECL2 are decls of target builtin functions to be used
   when vectorizing the operation, if available. In this case,
   CODE1 and CODE2 are CALL_EXPR. */

bool
supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
                                tree *decl1, tree *decl2,
                                enum tree_code *code1, enum tree_code *code2)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  bool ordered_p;
  enum machine_mode vec_mode;
  enum insn_code icode1, icode2;
  optab optab1, optab2;
  tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
  tree type = TREE_TYPE (expr);
  tree wide_vectype = get_vectype_for_scalar_type (type);
  enum tree_code c1, c2;

  /* The result of a vectorized widening operation usually requires two vectors
     (because the widened results do not fit in one vector). The generated
     vector results would normally be expected to be generated in the same
     order as in the original scalar computation. i.e. if 8 results are
     generated in each vector iteration, they are to be organized as follows:
     vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].

     However, in the special case that the result of the widening operation is
     used in a reduction computation only, the order doesn't matter (because
     when vectorizing a reduction we change the order of the computation).
     Some targets can take advantage of this and generate more efficient code.
     For example, targets like Altivec, that support widen_mult using a sequence
     of {mult_even,mult_odd} generate the following vectors:
     vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].

     When vectorizing outer-loops, we execute the inner-loop sequentially
     (each vectorized inner-loop iteration contributes to VF outer-loop
     iterations in parallel). We therefore don't allow to change the order
     of the computation in the inner-loop during outer-loop vectorization. */

  if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
      && !nested_in_vect_loop_p (vect_loop, stmt))
    ordered_p = false;
  else
    ordered_p = true;

  /* Unordered case: prefer the target's even/odd widening-multiply
     builtins when both exist for this vector type.  */
  if (!ordered_p
      && code == WIDEN_MULT_EXPR
      && targetm.vectorize.builtin_mul_widen_even
      && targetm.vectorize.builtin_mul_widen_even (vectype)
      && targetm.vectorize.builtin_mul_widen_odd
      && targetm.vectorize.builtin_mul_widen_odd (vectype))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Unordered widening operation detected.");

      *code1 = *code2 = CALL_EXPR;
      *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
      *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
      return true;
    }

  /* Pick the HI/LO tree-code pair; which of them produces the first
     half of the results depends on the target's endianness.  */
  switch (code)
    {
    case WIDEN_MULT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_WIDEN_MULT_HI_EXPR;
          c2 = VEC_WIDEN_MULT_LO_EXPR;
        }
      else
        {
          c2 = VEC_WIDEN_MULT_HI_EXPR;
          c1 = VEC_WIDEN_MULT_LO_EXPR;
        }
      break;

    case NOP_EXPR:
    case CONVERT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_UNPACK_HI_EXPR;
          c2 = VEC_UNPACK_LO_EXPR;
        }
      else
        {
          c2 = VEC_UNPACK_HI_EXPR;
          c1 = VEC_UNPACK_LO_EXPR;
        }
      break;

    case FLOAT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_UNPACK_FLOAT_HI_EXPR;
          c2 = VEC_UNPACK_FLOAT_LO_EXPR;
        }
      else
        {
          c2 = VEC_UNPACK_FLOAT_HI_EXPR;
          c1 = VEC_UNPACK_FLOAT_LO_EXPR;
        }
      break;

    case FIX_TRUNC_EXPR:
      /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
         VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
         computing the operation. */
      return false;

    default:
      gcc_unreachable ();
    }

  /* NOTE(review): this branch is currently unreachable - FIX_TRUNC_EXPR
     returns false in the switch above.  Presumably kept for when the
     VEC_UNPACK_FIX_TRUNC_* codes are implemented.  */
  if (code == FIX_TRUNC_EXPR)
    {
      /* The signedness is determined from output operand. */
      optab1 = optab_for_tree_code (c1, type);
      optab2 = optab_for_tree_code (c2, type);
    }
  else
    {
      optab1 = optab_for_tree_code (c1, vectype);
      optab2 = optab_for_tree_code (c2, vectype);
    }

  if (!optab1 || !optab2)
    return false;

  /* Both halves must be supported and must produce the wide vector mode.  */
  vec_mode = TYPE_MODE (vectype);
  if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
      || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
      || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
                                                       == CODE_FOR_nothing
      || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
    return false;

  *code1 = c1;
  *code2 = c2;
  return true;
}
2232
2233
2234 /* Function supportable_narrowing_operation
2235
2236 Check whether an operation represented by the code CODE is a
2237 narrowing operation that is supported by the target platform in
2238 vector form (i.e., when operating on arguments of type VECTYPE).
2239
2240 Narrowing operations we currently support are NOP (CONVERT) and
2241 FIX_TRUNC. This function checks if these operations are supported by
2242 the target platform directly via vector tree-codes.
2243
2244 Output:
2245 - CODE1 is the code of a vector operation to be used when
2246 vectorizing the operation, if available. */
2247
2248 bool
2249 supportable_narrowing_operation (enum tree_code code,
2250 const_tree stmt, const_tree vectype,
2251 enum tree_code *code1)
2252 {
2253 enum machine_mode vec_mode;
2254 enum insn_code icode1;
2255 optab optab1;
2256 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2257 tree type = TREE_TYPE (expr);
2258 tree narrow_vectype = get_vectype_for_scalar_type (type);
2259 enum tree_code c1;
2260
2261 switch (code)
2262 {
2263 case NOP_EXPR:
2264 case CONVERT_EXPR:
2265 c1 = VEC_PACK_TRUNC_EXPR;
2266 break;
2267
2268 case FIX_TRUNC_EXPR:
2269 c1 = VEC_PACK_FIX_TRUNC_EXPR;
2270 break;
2271
2272 case FLOAT_EXPR:
2273 /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2274 tree code and optabs used for computing the operation. */
2275 return false;
2276
2277 default:
2278 gcc_unreachable ();
2279 }
2280
2281 if (code == FIX_TRUNC_EXPR)
2282 /* The signedness is determined from output operand. */
2283 optab1 = optab_for_tree_code (c1, type);
2284 else
2285 optab1 = optab_for_tree_code (c1, vectype);
2286
2287 if (!optab1)
2288 return false;
2289
2290 vec_mode = TYPE_MODE (vectype);
2291 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2292 || insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2293 return false;
2294
2295 *code1 = c1;
2296 return true;
2297 }
2298
2299
2300 /* Function reduction_code_for_scalar_code
2301
2302 Input:
2303 CODE - tree_code of a reduction operations.
2304
2305 Output:
2306 REDUC_CODE - the corresponding tree-code to be used to reduce the
2307 vector of partial results into a single scalar result (which
2308 will also reside in a vector).
2309
2310 Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise. */
2311
2312 bool
2313 reduction_code_for_scalar_code (enum tree_code code,
2314 enum tree_code *reduc_code)
2315 {
2316 switch (code)
2317 {
2318 case MAX_EXPR:
2319 *reduc_code = REDUC_MAX_EXPR;
2320 return true;
2321
2322 case MIN_EXPR:
2323 *reduc_code = REDUC_MIN_EXPR;
2324 return true;
2325
2326 case PLUS_EXPR:
2327 *reduc_code = REDUC_PLUS_EXPR;
2328 return true;
2329
2330 default:
2331 return false;
2332 }
2333 }
2334
2335
/* Function vect_is_simple_reduction

   Detect a cross-iteration def-use cycle that represents a simple
   reduction computation. We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation.
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   Returns the stmt that performs the reduction operation (with its
   operands possibly swapped so the reduction variable is second), or
   NULL_TREE if PHI does not head a supported reduction cycle.  */

tree
vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
{
  struct loop *loop = (bb_for_stmt (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  tree def_stmt, def1, def2;
  enum tree_code code;
  int op_type;
  tree operation, op1, op2;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));

  /* Reject if the phi result (a1) has more than one relevant use inside
     the loop (it may only feed the reduction stmt itself).  */
  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "reduction used in loop.");
	  return NULL_TREE;
	}
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not ssa_name: ");
	  print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
	}
      return NULL_TREE;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL_TREE;
    }

  if (TREE_CODE (def_stmt) != GIMPLE_MODIFY_STMT)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
      return NULL_TREE;
    }

  /* Likewise for the value defined by the candidate reduction stmt (a2):
     at most one relevant use inside the loop.  */
  name = GIMPLE_STMT_OPERAND (def_stmt, 0);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "reduction used in loop.");
	  return NULL_TREE;
	}
    }

  /* Condition 1: the operation must be commutative and associative.  */
  operation = GIMPLE_STMT_OPERAND (def_stmt, 1);
  code = TREE_CODE (operation);
  if (!commutative_tree_code (code) || !associative_tree_code (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not commutative/associative: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  op_type = TREE_OPERAND_LENGTH (operation);
  if (op_type != binary_op)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not binary operation: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  op1 = TREE_OPERAND (operation, 0);
  op2 = TREE_OPERAND (operation, 1);
  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: uses not ssa_names: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* Check that it's ok to change the order of the computation.
     Mixed-type reductions are not handled.  */
  type = TREE_TYPE (operation);
  if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
      || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: multiple types: operation type: ");
	  print_generic_expr (vect_dump, type, TDF_SLIM);
	  fprintf (vect_dump, ", operands types: ");
	  print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
	  fprintf (vect_dump, ",");
	  print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */

  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && !nested_in_vect_loop_p (vect_loop, def_stmt))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe fp math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
	   && !nested_in_vect_loop_p (vect_loop, def_stmt))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe int math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe fixed-point math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* reduction is safe. we're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization.
   */
  def1 = SSA_NAME_DEF_STMT (op1);
  def2 = SSA_NAME_DEF_STMT (op2);
  if (!def1 || !def2 || IS_EMPTY_STMT (def1) || IS_EMPTY_STMT (def2))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: no defs for operands: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }


  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_loop_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 == phi
      && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
      && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT
	  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
	  || (TREE_CODE (def1) == PHI_NODE
	      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
	      && !is_loop_header_bb_p (bb_for_stmt (def1)))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "detected reduction:");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return def_stmt;
    }
  else if (def1 == phi
	   && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
	   && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT
	       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
	       || (TREE_CODE (def2) == PHI_NODE
		   && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
		   && !is_loop_header_bb_p (bb_for_stmt (def2)))))
    {
      /* Swap operands (just for simplicity - so that the rest of the code
	 can assume that the reduction variable is always the last (second)
	 argument).  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "detected reduction: need to swap operands:");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      swap_tree_operands (def_stmt, &TREE_OPERAND (operation, 0),
			  &TREE_OPERAND (operation, 1));
      return def_stmt;
    }
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unknown pattern.");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
}
2592
2593
2594 /* Function vect_is_simple_iv_evolution.
2595
2596 FORNOW: A simple evolution of an induction variables in the loop is
2597 considered a polynomial evolution with constant step. */
2598
2599 bool
2600 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
2601 tree * step)
2602 {
2603 tree init_expr;
2604 tree step_expr;
2605 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2606
2607 /* When there is no evolution in this loop, the evolution function
2608 is not "simple". */
2609 if (evolution_part == NULL_TREE)
2610 return false;
2611
2612 /* When the evolution is a polynomial of degree >= 2
2613 the evolution function is not "simple". */
2614 if (tree_is_chrec (evolution_part))
2615 return false;
2616
2617 step_expr = evolution_part;
2618 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2619
2620 if (vect_print_dump_info (REPORT_DETAILS))
2621 {
2622 fprintf (vect_dump, "step: ");
2623 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2624 fprintf (vect_dump, ", init: ");
2625 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
2626 }
2627
2628 *init = init_expr;
2629 *step = step_expr;
2630
2631 if (TREE_CODE (step_expr) != INTEGER_CST)
2632 {
2633 if (vect_print_dump_info (REPORT_DETAILS))
2634 fprintf (vect_dump, "step unknown.");
2635 return false;
2636 }
2637
2638 return true;
2639 }
2640
2641
/* Function vectorize_loops.

   Entry Point to loop vectorization phase.  Analyzes every original
   loop and transforms the vectorizable ones.  Returns TODO_cleanup_cfg
   if at least one loop was vectorized, 0 otherwise.  */

unsigned
vectorize_loops (void)
{
  unsigned int i;
  unsigned int num_vectorized_loops = 0;
  unsigned int vect_loops_num;
  loop_iterator li;
  struct loop *loop;

  vect_loops_num = number_of_loops ();

  /* Bail out if there are no loops.  */
  if (vect_loops_num <= 1)
    return 0;

  /* Fix the verbosity level if not defined explicitly by the user.  */
  vect_set_dump_settings ();

  /* Allocate the bitmap that records which virtual variables need
     to be renamed.  */
  vect_memsyms_to_rename = BITMAP_ALLOC (NULL);

  /* ----------- Analyze loops. ----------- */

  /* If some loop was duplicated, it gets bigger number
     than all previously defined loops. This fact allows us to run
     only over initial loops skipping newly generated ones.  */
  FOR_EACH_LOOP (li, loop, 0)
    {
      loop_vec_info loop_vinfo;

      vect_loop_location = find_loop_location (loop);
      loop_vinfo = vect_analyze_loop (loop);
      /* Stash the analysis result on the loop so it can be freed in the
	 finalization pass below.  */
      loop->aux = loop_vinfo;

      if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
	continue;

      vect_transform_loop (loop_vinfo);
      num_vectorized_loops++;
    }
  vect_loop_location = UNKNOWN_LOC;

  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
      || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
	  && num_vectorized_loops > 0))
    fprintf (vect_dump, "vectorized %u loops in function.\n",
	     num_vectorized_loops);

  /* ----------- Finalize. ----------- */

  BITMAP_FREE (vect_memsyms_to_rename);

  /* Free the loop_vec_infos stashed in loop->aux above.  Iteration
     starts at 1, matching the <= 1 bail-out above (index 0 is
     presumably the root of the loop tree - confirm against the
     number_of_loops convention).  */
  for (i = 1; i < vect_loops_num; i++)
    {
      loop_vec_info loop_vinfo;

      loop = get_loop (i);
      if (!loop)
	continue;
      loop_vinfo = loop->aux;
      destroy_loop_vec_info (loop_vinfo, true);
      loop->aux = NULL;
    }

  return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
}
2713
2714 /* Increase alignment of global arrays to improve vectorization potential.
2715 TODO:
2716 - Consider also structs that have an array field.
2717 - Use ipa analysis to prune arrays that can't be vectorized?
2718 This should involve global alignment analysis and in the future also
2719 array padding. */
2720
2721 static unsigned int
2722 increase_alignment (void)
2723 {
2724 struct varpool_node *vnode;
2725
2726 /* Increase the alignment of all global arrays for vectorization. */
2727 for (vnode = varpool_nodes_queue;
2728 vnode;
2729 vnode = vnode->next_needed)
2730 {
2731 tree vectype, decl = vnode->decl;
2732 unsigned int alignment;
2733
2734 if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
2735 continue;
2736 vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2737 if (!vectype)
2738 continue;
2739 alignment = TYPE_ALIGN (vectype);
2740 if (DECL_ALIGN (decl) >= alignment)
2741 continue;
2742
2743 if (vect_can_force_dr_alignment_p (decl, alignment))
2744 {
2745 DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
2746 DECL_USER_ALIGN (decl) = 1;
2747 if (dump_file)
2748 {
2749 fprintf (dump_file, "Increasing alignment of decl: ");
2750 print_generic_expr (dump_file, decl, TDF_SLIM);
2751 }
2752 }
2753 }
2754 return 0;
2755 }
2756
2757 static bool
2758 gate_increase_alignment (void)
2759 {
2760 return flag_section_anchors && flag_tree_vectorize;
2761 }
2762
/* Pass descriptor for the simple IPA pass that raises the alignment of
   vectorizable global arrays (see increase_alignment above).  */
struct tree_opt_pass pass_ipa_increase_alignment =
{
  "increase_alignment",			/* name */
  gate_increase_alignment,		/* gate */
  increase_alignment,			/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  0,					/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  0,					/* todo_flags_finish */
  0					/* letter */
};