1 /* Transformation Utilities for Loop Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "ggc.h"
26 #include "tree.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "basic-block.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "recog.h"
39 #include "tree-data-ref.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "langhooks.h"
44 #include "tree-pass.h"
45 #include "toplev.h"
46 #include "real.h"
47
48 /* Utility functions for the code transformation. */
49 static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
50 static tree vect_create_destination_var (tree, tree);
51 static tree vect_create_data_ref_ptr
52 (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
53 static tree vect_create_addr_base_for_vector_ref
54 (tree, tree *, tree, struct loop *);
55 static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
56 static tree vect_get_vec_def_for_operand (tree, tree, tree *);
57 static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
58 static void vect_finish_stmt_generation
59 (tree stmt, tree vec_stmt, block_stmt_iterator *);
60 static bool vect_is_simple_cond (tree, loop_vec_info);
61 static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
62 static tree get_initial_def_for_reduction (tree, tree, tree *);
63
64 /* Utility function dealing with loop peeling (not peeling itself). */
65 static void vect_generate_tmps_on_preheader
66 (loop_vec_info, tree *, tree *, tree *);
67 static tree vect_build_loop_niters (loop_vec_info);
68 static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
69 static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
70 static void vect_update_init_of_dr (struct data_reference *, tree niters);
71 static void vect_update_inits_of_drs (loop_vec_info, tree);
72 static int vect_min_worthwhile_factor (enum tree_code);
73
74
75 static int
76 cost_for_stmt (tree stmt)
77 {
78 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
79
80 switch (STMT_VINFO_TYPE (stmt_info))
81 {
82 case load_vec_info_type:
83 return TARG_SCALAR_LOAD_COST;
84 case store_vec_info_type:
85 return TARG_SCALAR_STORE_COST;
86 case op_vec_info_type:
87 case condition_vec_info_type:
88 case assignment_vec_info_type:
89 case reduc_vec_info_type:
90 case induc_vec_info_type:
91 case type_promotion_vec_info_type:
92 case type_demotion_vec_info_type:
93 case type_conversion_vec_info_type:
94 case call_vec_info_type:
95 return TARG_SCALAR_STMT_COST;
96 case undef_vec_info_type:
97 default:
98 gcc_unreachable ();
99 }
100 }
101
102
103 /* Function vect_estimate_min_profitable_iters
104
105 Return the number of iterations required for the vector version of the
106 loop to be profitable relative to the cost of the scalar version of the
107 loop.
108
109 TODO: Take profile info into account before making vectorization
110 decisions, if available. */
111
112 int
113 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
114 {
115 int i;
116 int min_profitable_iters;
117 int peel_iters_prologue;
118 int peel_iters_epilogue;
119 int vec_inside_cost = 0;
120 int vec_outside_cost = 0;
121 int scalar_single_iter_cost = 0;
122 int scalar_outside_cost = 0;
123 bool runtime_test = false;
124 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
125 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
126 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
127 int nbbs = loop->num_nodes;
128 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
129 int peel_guard_costs = 0;
130 int innerloop_iters = 0, factor;
131 VEC (slp_instance, heap) *slp_instances;
132 slp_instance instance;
133
134 /* Cost model disabled. */
135 if (!flag_vect_cost_model)
136 {
137 if (vect_print_dump_info (REPORT_COST))
138 fprintf (vect_dump, "cost model disabled.");
139 return 0;
140 }
141
142   /* If the number of iterations is unknown, or the
143      peeling-for-misalignment amount is unknown, we will have to generate
144      a runtime test to compare the loop count against the threshold.  */
145 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
146 || (byte_misalign < 0))
147 runtime_test = true;
148
149 /* Requires loop versioning tests to handle misalignment. */
150
151 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
152 {
153 /* FIXME: Make cost depend on complexity of individual check. */
154 vec_outside_cost +=
155 VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
156 if (vect_print_dump_info (REPORT_COST))
157 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
158 "versioning to treat misalignment.\n");
159 }
160
161 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
162 {
163 /* FIXME: Make cost depend on complexity of individual check. */
164 vec_outside_cost +=
165 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
166 if (vect_print_dump_info (REPORT_COST))
167 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
168                  "versioning for aliasing.\n");
169 }
170
171 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
172 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
173 {
174 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
175 }
176
177 /* Count statements in scalar loop. Using this as scalar cost for a single
178 iteration for now.
179
180 TODO: Add outer loop support.
181
182 TODO: Consider assigning different costs to different scalar
183 statements. */
184
185 /* FORNOW. */
186 if (loop->inner)
187 innerloop_iters = 50; /* FIXME */
188
189 for (i = 0; i < nbbs; i++)
190 {
191 block_stmt_iterator si;
192 basic_block bb = bbs[i];
193
194 if (bb->loop_father == loop->inner)
195 factor = innerloop_iters;
196 else
197 factor = 1;
198
199 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
200 {
201 tree stmt = bsi_stmt (si);
202 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
203 /* Skip stmts that are not vectorized inside the loop. */
204 if (!STMT_VINFO_RELEVANT_P (stmt_info)
205 && (!STMT_VINFO_LIVE_P (stmt_info)
206 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
207 continue;
208 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
209 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
210 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
211 some of the "outside" costs are generated inside the outer-loop. */
212 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
213 }
214 }
215
216 /* Add additional cost for the peeled instructions in prologue and epilogue
217 loop.
218
219      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
220      at compile time, we assume it's vf/2 (the worst would be vf-1).
221
222 TODO: Build an expression that represents peel_iters for prologue and
223 epilogue to be used in a run-time test. */
224
225 if (byte_misalign < 0)
226 {
227 peel_iters_prologue = vf/2;
228 if (vect_print_dump_info (REPORT_COST))
229 fprintf (vect_dump, "cost model: "
230 "prologue peel iters set to vf/2.");
231
232       /* If peeling for alignment is unknown, the loop bound of the main loop
233          becomes unknown.  */
234 peel_iters_epilogue = vf/2;
235 if (vect_print_dump_info (REPORT_COST))
236 fprintf (vect_dump, "cost model: "
237 "epilogue peel iters set to vf/2 because "
238                  "peeling for alignment is unknown.");
239
240 /* If peeled iterations are unknown, count a taken branch and a not taken
241 branch per peeled loop. Even if scalar loop iterations are known,
242 vector iterations are not known since peeled prologue iterations are
243 not known. Hence guards remain the same. */
244 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
245 + TARG_COND_NOT_TAKEN_BRANCH_COST);
246
247 }
248 else
249 {
250 if (byte_misalign)
251 {
252 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
253 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
254 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
255 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
256
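          /* For example (illustrative numbers only): a V8HI access
             (nelements = 8, element_size = 2) that is misaligned by
             byte_misalign = 6 needs 8 - 6/2 = 5 scalar prologue
             iterations to reach an aligned address.  */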
257 peel_iters_prologue = nelements - (byte_misalign / element_size);
258 }
259 else
260 peel_iters_prologue = 0;
261
262 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
263 {
264 peel_iters_epilogue = vf/2;
265 if (vect_print_dump_info (REPORT_COST))
266 fprintf (vect_dump, "cost model: "
267 "epilogue peel iters set to vf/2 because "
268                      "loop iterations are unknown.");
269
270           /* If peeled iterations are known but the number of scalar loop
271              iterations is unknown, count a taken branch per peeled loop.  */
272 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
273
274 }
275 else
276 {
277 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
278 peel_iters_prologue = niters < peel_iters_prologue ?
279 niters : peel_iters_prologue;
280 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
281 }
282 }
283
284 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
285 + (peel_iters_epilogue * scalar_single_iter_cost)
286 + peel_guard_costs;
287
288 /* FORNOW: The scalar outside cost is incremented in one of the
289 following ways:
290
291 1. The vectorizer checks for alignment and aliasing and generates
292 a condition that allows dynamic vectorization. A cost model
293 check is ANDED with the versioning condition. Hence scalar code
294 path now has the added cost of the versioning check.
295
296 if (cost > th & versioning_check)
297 jmp to vector code
298
299 Hence run-time scalar is incremented by not-taken branch cost.
300
301 2. The vectorizer then checks if a prologue is required. If the
302 cost model check was not done before during versioning, it has to
303 be done before the prologue check.
304
305 if (cost <= th)
306 prologue = scalar_iters
307 if (prologue == 0)
308 jmp to vector code
309 else
310 execute prologue
311 if (prologue == num_iters)
312 go to exit
313
314 Hence the run-time scalar cost is incremented by a taken branch,
315 plus a not-taken branch, plus a taken branch cost.
316
317 3. The vectorizer then checks if an epilogue is required. If the
318 cost model check was not done before during prologue check, it
319 has to be done with the epilogue check.
320
321 if (prologue == 0)
322 jmp to vector code
323 else
324 execute prologue
325 if (prologue == num_iters)
326 go to exit
327 vector code:
328 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
329 jmp to epilogue
330
331 Hence the run-time scalar cost should be incremented by 2 taken
332 branches.
333
334      TODO: The back end may reorder the BBs differently and reverse
335      conditions/branch directions. Change the estimates below to
336      something more reasonable.  */
337
338 if (runtime_test)
339 {
340 /* Cost model check occurs at versioning. */
341 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
342 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
343 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
344 else
345 {
346           /* Cost model check occurs at prologue generation.  */
347 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
348 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
349 + TARG_COND_NOT_TAKEN_BRANCH_COST;
350 /* Cost model check occurs at epilogue generation. */
351 else
352 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
353 }
354 }
355
356 /* Add SLP costs. */
357 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
358 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
359 {
360 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
361 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
362 }
363
364 /* Calculate number of iterations required to make the vector version
365 profitable, relative to the loop bodies only. The following condition
366 must hold true:
367 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
368 where
369 SIC = scalar iteration cost, VIC = vector iteration cost,
370 VOC = vector outside cost, VF = vectorization factor,
371      PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
372 SOC = scalar outside cost for run time cost model check. */
373
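      /* Solving the inequality above for niters (assuming SIC * VF > VIC,
         which is exactly the condition tested below) gives the closed form
         used for min_profitable_iters:

           niters * (SIC * VF - VIC)
             > (VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS)

           niters > ((VOC - SOC) * VF - VIC * PL_ITERS - VIC * EP_ITERS)
                    / (SIC * VF - VIC)  */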
374 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
375 {
376 if (vec_outside_cost <= 0)
377 min_profitable_iters = 1;
378 else
379 {
380 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
381 - vec_inside_cost * peel_iters_prologue
382 - vec_inside_cost * peel_iters_epilogue)
383 / ((scalar_single_iter_cost * vf)
384 - vec_inside_cost);
385
386 if ((scalar_single_iter_cost * vf * min_profitable_iters)
387 <= ((vec_inside_cost * min_profitable_iters)
388 + ((vec_outside_cost - scalar_outside_cost) * vf)))
389 min_profitable_iters++;
390 }
391 }
392 /* vector version will never be profitable. */
393 else
394 {
395 if (vect_print_dump_info (REPORT_COST))
396         fprintf (vect_dump, "cost model: vector iteration cost = %d "
397                  "divided by scalar iteration cost = %d is greater than or "
398                  "equal to the vectorization factor = %d.",
399                  vec_inside_cost, scalar_single_iter_cost, vf);
400 return -1;
401 }
402
403 if (vect_print_dump_info (REPORT_COST))
404 {
405 fprintf (vect_dump, "Cost model analysis: \n");
406 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
407 vec_inside_cost);
408 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
409 vec_outside_cost);
410 fprintf (vect_dump, " Scalar iteration cost: %d\n",
411 scalar_single_iter_cost);
412 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
413 fprintf (vect_dump, " prologue iterations: %d\n",
414 peel_iters_prologue);
415 fprintf (vect_dump, " epilogue iterations: %d\n",
416 peel_iters_epilogue);
417 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
418 min_profitable_iters);
419 }
420
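  /* Fewer than VF iterations cannot execute even a single vector iteration,
     so never report a threshold below the vectorization factor.  */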
421 min_profitable_iters =
422 min_profitable_iters < vf ? vf : min_profitable_iters;
423
424 /* Because the condition we create is:
425 if (niters <= min_profitable_iters)
426 then skip the vectorized loop. */
427 min_profitable_iters--;
428
429 if (vect_print_dump_info (REPORT_COST))
430 fprintf (vect_dump, " Profitability threshold = %d\n",
431 min_profitable_iters);
432
433 return min_profitable_iters;
434 }
435
436
437 /* TODO: There is a close dependency between the vect_model_*_cost and the
438    vectorizable_* functions. Design this better to avoid maintenance issues.  */
439
440 /* Function vect_model_reduction_cost.
441
442 Models cost for a reduction operation, including the vector ops
443 generated within the strip-mine loop, the initial definition before
444 the loop, and the epilogue code that must be generated. */
445
446 static bool
447 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
448 int ncopies)
449 {
450 int outer_cost = 0;
451 enum tree_code code;
452 optab optab;
453 tree vectype;
454 tree orig_stmt;
455 tree reduction_op;
456 enum machine_mode mode;
457 tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
458 int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
459 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
461
462 /* Cost of reduction op inside loop. */
463 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
464
465 reduction_op = TREE_OPERAND (operation, op_type-1);
466 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
467 if (!vectype)
468 {
469 if (vect_print_dump_info (REPORT_COST))
470 {
471 fprintf (vect_dump, "unsupported data-type ");
472 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
473 }
474 return false;
475 }
476
477 mode = TYPE_MODE (vectype);
478 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
479
480 if (!orig_stmt)
481 orig_stmt = STMT_VINFO_STMT (stmt_info);
482
483 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
484
485 /* Add in cost for initial definition. */
486 outer_cost += TARG_SCALAR_TO_VEC_COST;
487
488 /* Determine cost of epilogue code.
489
490 We have a reduction operator that will reduce the vector in one statement.
491 Also requires scalar extract. */
492
493 if (!nested_in_vect_loop_p (loop, orig_stmt))
494 {
495 if (reduc_code < NUM_TREE_CODES)
496 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
497 else
498 {
499 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
500 tree bitsize =
501           TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
502 int element_bitsize = tree_low_cst (bitsize, 1);
503 int nelements = vec_size_in_bits / element_bitsize;
504
505 optab = optab_for_tree_code (code, vectype);
506
507 /* We have a whole vector shift available. */
508 if (VECTOR_MODE_P (mode)
509 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
510 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
511 /* Final reduction via vector shifts and the reduction operator. Also
512 requires scalar extract. */
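        /* For example (illustrative values): a vector of nelements = 4 needs
           exact_log2 (4) = 2 shift-and-reduce stages, i.e. 4 vector
           statements, plus one extract of the final scalar result.  */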
513         outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
514 + TARG_VEC_TO_SCALAR_COST);
515 else
516 /* Use extracts and reduction op for final reduction. For N elements,
517 we have N extracts and N-1 reduction ops. */
518 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
519 }
520 }
521
522 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
523
524 if (vect_print_dump_info (REPORT_COST))
525 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
526 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
527 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
528
529 return true;
530 }
531
532
533 /* Function vect_model_induction_cost.
534
535 Models cost for induction operations. */
536
537 static void
538 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
539 {
540 /* loop cost for vec_loop. */
541 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
542 /* prologue cost for vec_init and vec_step. */
543 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
544
545 if (vect_print_dump_info (REPORT_COST))
546 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
547 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
548 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
549 }
550
551
552 /* Function vect_model_simple_cost.
553
554 Models cost for simple operations, i.e. those that only emit ncopies of a
555 single op. Right now, this does not account for multiple insns that could
556 be generated for the single vector op. We will handle that shortly. */
557
558 void
559 vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
560 enum vect_def_type *dt, slp_tree slp_node)
561 {
562 int i;
563 int inside_cost = 0, outside_cost = 0;
564
565 inside_cost = ncopies * TARG_VEC_STMT_COST;
566
567   /* FORNOW: Assuming a maximum of 2 args per stmt.  */
568 for (i = 0; i < 2; i++)
569 {
570 if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
571 outside_cost += TARG_SCALAR_TO_VEC_COST;
572 }
573
574 if (vect_print_dump_info (REPORT_COST))
575 fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
576 "outside_cost = %d .", inside_cost, outside_cost);
577
578   /* Set the costs either in STMT_INFO or SLP_NODE (if it exists).  */
579 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
580 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
581 }
582
583
584 /* Function vect_cost_strided_group_size
585
586 For strided load or store, return the group_size only if it is the first
587 load or store of a group, else return 1. This ensures that group size is
588 only returned once per group. */
589
590 static int
591 vect_cost_strided_group_size (stmt_vec_info stmt_info)
592 {
593 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
594
595 if (first_stmt == STMT_VINFO_STMT (stmt_info))
596 return DR_GROUP_SIZE (stmt_info);
597
598 return 1;
599 }
600
601
602 /* Function vect_model_store_cost
603
604 Models cost for stores. In the case of strided accesses, one access
605 has the overhead of the strided access attributed to it. */
606
607 void
608 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
609 enum vect_def_type dt, slp_tree slp_node)
610 {
611 int group_size;
612 int inside_cost = 0, outside_cost = 0;
613
614 if (dt == vect_constant_def || dt == vect_invariant_def)
615 outside_cost = TARG_SCALAR_TO_VEC_COST;
616
617 /* Strided access? */
618 if (DR_GROUP_FIRST_DR (stmt_info))
619 group_size = vect_cost_strided_group_size (stmt_info);
620 /* Not a strided access. */
621 else
622 group_size = 1;
623
624 /* Is this an access in a group of stores, which provide strided access?
625 If so, add in the cost of the permutes. */
626 if (group_size > 1)
627 {
628 /* Uses a high and low interleave operation for each needed permute. */
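      /* For example (illustrative): with group_size = 4 there are
         exact_log2 (4) = 2 interleave stages of group_size = 4 statements
         each, i.e. 8 permute statements per copy.  */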
629       inside_cost = ncopies * exact_log2 (group_size) * group_size
630 * TARG_VEC_STMT_COST;
631
632 if (vect_print_dump_info (REPORT_COST))
633 fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
634 group_size);
635
636 }
637
638 /* Costs of the stores. */
639 inside_cost += ncopies * TARG_VEC_STORE_COST;
640
641 if (vect_print_dump_info (REPORT_COST))
642 fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
643 "outside_cost = %d .", inside_cost, outside_cost);
644
645   /* Set the costs either in STMT_INFO or SLP_NODE (if it exists).  */
646 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
647 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
648 }
649
650
651 /* Function vect_model_load_cost
652
653    Models cost for loads. In the case of strided accesses, one access
654    has the overhead of the strided access attributed to it. Since unaligned
655 accesses are supported for loads, we also account for the costs of the
656 access scheme chosen. */
657
658 void
659 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
660
661 {
662 int group_size;
663   int alignment_support_scheme;
664 tree first_stmt;
665 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
666 int inside_cost = 0, outside_cost = 0;
667
668 /* Strided accesses? */
669 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
670 if (first_stmt && !slp_node)
671 {
672 group_size = vect_cost_strided_group_size (stmt_info);
673 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
674 }
675 /* Not a strided access. */
676 else
677 {
678 group_size = 1;
679 first_dr = dr;
680 }
681
682   alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
683
684 /* Is this an access in a group of loads providing strided access?
685 If so, add in the cost of the permutes. */
686 if (group_size > 1)
687 {
688       /* Uses even and odd extract operations for each needed permute.  */
689       inside_cost = ncopies * exact_log2 (group_size) * group_size
690 * TARG_VEC_STMT_COST;
691
692 if (vect_print_dump_info (REPORT_COST))
693 fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
694 group_size);
695
696 }
697
698 /* The loads themselves. */
699   switch (alignment_support_scheme)
700 {
701 case dr_aligned:
702 {
703 inside_cost += ncopies * TARG_VEC_LOAD_COST;
704
705 if (vect_print_dump_info (REPORT_COST))
706 fprintf (vect_dump, "vect_model_load_cost: aligned.");
707
708 break;
709 }
710 case dr_unaligned_supported:
711 {
712 /* Here, we assign an additional cost for the unaligned load. */
713 inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;
714
715 if (vect_print_dump_info (REPORT_COST))
716 fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
717 "hardware.");
718
719 break;
720 }
721 case dr_explicit_realign:
722 {
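        /* The explicit realignment scheme issues, per vector access, two
           aligned vector loads plus one statement that combines them into
           the required unaligned value; the mask computation below is
           charged in addition when the target provides
           builtin_mask_for_load.  */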
723 inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
724
725 /* FIXME: If the misalignment remains fixed across the iterations of
726 the containing loop, the following cost should be added to the
727 outside costs. */
728 if (targetm.vectorize.builtin_mask_for_load)
729 inside_cost += TARG_VEC_STMT_COST;
730
731 break;
732 }
733 case dr_explicit_realign_optimized:
734 {
735 if (vect_print_dump_info (REPORT_COST))
736 fprintf (vect_dump, "vect_model_load_cost: unaligned software "
737 "pipelined.");
738
739 /* Unaligned software pipeline has a load of an address, an initial
740 load, and possibly a mask operation to "prime" the loop. However,
741 if this is an access in a group of loads, which provide strided
742 access, then the above cost should only be considered for one
743 access in the group. Inside the loop, there is a load op
744 and a realignment op. */
745
746 if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
747 {
748 outside_cost = 2*TARG_VEC_STMT_COST;
749 if (targetm.vectorize.builtin_mask_for_load)
750 outside_cost += TARG_VEC_STMT_COST;
751 }
752
753 inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
754
755 break;
756 }
757
758 default:
759 gcc_unreachable ();
760 }
761
762 if (vect_print_dump_info (REPORT_COST))
763 fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
764 "outside_cost = %d .", inside_cost, outside_cost);
765
766   /* Set the costs either in STMT_INFO or SLP_NODE (if it exists).  */
767 stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
768 stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
769 }
770
771
772 /* Function vect_get_new_vect_var.
773
774    Returns a name for a new variable. The current naming scheme prepends the
775    prefix "vect_", "stmp_" or "vect_p" (depending on the value of VAR_KIND)
776    to NAME if it is provided, and uses the prefix alone to name the
777    variable otherwise.  */
778
779 static tree
780 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
781 {
782 const char *prefix;
783 tree new_vect_var;
784
785 switch (var_kind)
786 {
787 case vect_simple_var:
788 prefix = "vect_";
789 break;
790 case vect_scalar_var:
791 prefix = "stmp_";
792 break;
793 case vect_pointer_var:
794 prefix = "vect_p";
795 break;
796 default:
797 gcc_unreachable ();
798 }
799
800 if (name)
801 {
802 char* tmp = concat (prefix, name, NULL);
803 new_vect_var = create_tmp_var (type, tmp);
804 free (tmp);
805 }
806 else
807 new_vect_var = create_tmp_var (type, prefix);
808
809 /* Mark vector typed variable as a gimple register variable. */
810 if (TREE_CODE (type) == VECTOR_TYPE)
811 DECL_GIMPLE_REG_P (new_vect_var) = true;
812
813 return new_vect_var;
814 }
815
816
817 /* Function vect_create_addr_base_for_vector_ref.
818
819 Create an expression that computes the address of the first memory location
820 that will be accessed for a data reference.
821
822 Input:
823 STMT: The statement containing the data reference.
824 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
825    OFFSET: Optional. If supplied, it is added to the initial address.
826    LOOP: Specifies the loop-nest relative to which the address should be computed.
827 For example, when the dataref is in an inner-loop nested in an
828 outer-loop that is now being vectorized, LOOP can be either the
829 outer-loop, or the inner-loop. The first memory location accessed
830 by the following dataref ('in' points to short):
831
832 for (i=0; i<N; i++)
833 for (j=0; j<M; j++)
834 s += in[i+j]
835
836 is as follows:
837 if LOOP=i_loop: &in (relative to i_loop)
838 if LOOP=j_loop: &in+i*2B (relative to j_loop)
839
840 Output:
841 1. Return an SSA_NAME whose value is the address of the memory location of
842 the first vector of the data reference.
843    2. If new_stmt_list is not NULL_TREE after return, the caller must insert
844       the statement(s) it contains, which define the returned SSA_NAME.
845
846 FORNOW: We are only handling array accesses with step 1. */
847
848 static tree
849 vect_create_addr_base_for_vector_ref (tree stmt,
850 tree *new_stmt_list,
851 tree offset,
852 struct loop *loop)
853 {
854 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
855 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
856 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
857 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
858 tree base_name;
859 tree data_ref_base_var;
860 tree new_base_stmt;
861 tree vec_stmt;
862 tree addr_base, addr_expr;
863 tree dest, new_stmt;
864 tree base_offset = unshare_expr (DR_OFFSET (dr));
865 tree init = unshare_expr (DR_INIT (dr));
866 tree vect_ptr_type, addr_expr2;
867 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
868
869 gcc_assert (loop);
870 if (loop != containing_loop)
871 {
872 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
873 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
874
875 gcc_assert (nested_in_vect_loop_p (loop, stmt));
876
877 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
878 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
879 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
880 }
881
882 /* Create data_ref_base */
883 base_name = build_fold_indirect_ref (data_ref_base);
884 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
885 add_referenced_var (data_ref_base_var);
886 data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt,
887 true, data_ref_base_var);
888   append_to_statement_list_force (new_base_stmt, new_stmt_list);
889
890 /* Create base_offset */
891 base_offset = size_binop (PLUS_EXPR, base_offset, init);
892 base_offset = fold_convert (sizetype, base_offset);
893 dest = create_tmp_var (TREE_TYPE (base_offset), "base_off");
894 add_referenced_var (dest);
895 base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest);
896 append_to_statement_list_force (new_stmt, new_stmt_list);
897
898 if (offset)
899 {
900 tree tmp = create_tmp_var (sizetype, "offset");
901
902 add_referenced_var (tmp);
903 offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step);
904 base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset),
905 base_offset, offset);
906 base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp);
907 append_to_statement_list_force (new_stmt, new_stmt_list);
908 }
909
910 /* base + base_offset */
911 addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
912 data_ref_base, base_offset);
913
914 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
915
916 /* addr_expr = addr_base */
917 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
918 get_name (base_name));
919 add_referenced_var (addr_expr);
920 vec_stmt = fold_convert (vect_ptr_type, addr_base);
921 addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
922 get_name (base_name));
923 add_referenced_var (addr_expr2);
924 vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2);
925 append_to_statement_list_force (new_stmt, new_stmt_list);
926
927 if (vect_print_dump_info (REPORT_DETAILS))
928 {
929 fprintf (vect_dump, "created ");
930 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
931 }
932 return vec_stmt;
933 }
934
935
936 /* Function vect_create_data_ref_ptr.
937
938 Create a new pointer to vector type (vp), that points to the first location
939 accessed in the loop by STMT, along with the def-use update chain to
940 appropriately advance the pointer through the loop iterations. Also set
941 aliasing information for the pointer. This vector pointer is used by the
942 callers to this function to create a memory reference expression for vector
943 load/store access.
944
945 Input:
946 1. STMT: a stmt that references memory. Expected to be of the form
947 GIMPLE_MODIFY_STMT <name, data-ref> or
948 GIMPLE_MODIFY_STMT <data-ref, name>.
949 2. AT_LOOP: the loop where the vector memref is to be created.
950 3. OFFSET (optional): an offset to be added to the initial address accessed
951 by the data-ref in STMT.
952 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain
953 pointing to the initial address.
954 5. TYPE: if not NULL indicates the required type of the data-ref
955
956 Output:
957 1. Declare a new ptr to vector_type, and have it point to the base of the
958       data reference (the initial address accessed by the data reference).
959 For example, for vector of type V8HI, the following code is generated:
960
961 v8hi *vp;
962 vp = (v8hi *)initial_address;
963
964 if OFFSET is not supplied:
965 initial_address = &a[init];
966 if OFFSET is supplied:
967 initial_address = &a[init + OFFSET];
968
969 Return the initial_address in INITIAL_ADDRESS.
970
971 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
972 update the pointer in each iteration of the loop.
973
974 Return the increment stmt that updates the pointer in PTR_INCR.
975
976 3. Set INV_P to true if the access pattern of the data reference in the
977 vectorized loop is invariant. Set it to false otherwise.
978
979 4. Return the pointer. */
980
981 static tree
982 vect_create_data_ref_ptr (tree stmt, struct loop *at_loop,
983 tree offset, tree *initial_address, tree *ptr_incr,
984 bool only_init, tree type, bool *inv_p)
985 {
986 tree base_name;
987 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
988 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
989 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
990 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
991 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
992 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
993 tree vect_ptr_type;
994 tree vect_ptr;
995 tree tag;
996 tree new_temp;
997 tree vec_stmt;
998 tree new_stmt_list = NULL_TREE;
999 edge pe;
1000 basic_block new_bb;
1001 tree vect_ptr_init;
1002 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1003 tree vptr;
1004 block_stmt_iterator incr_bsi;
1005 bool insert_after;
1006 tree indx_before_incr, indx_after_incr;
1007 tree incr;
1008 tree step;
1009
1010 /* Check the step (evolution) of the load in LOOP, and record
1011 whether it's invariant. */
1012 if (nested_in_vect_loop)
1013 step = STMT_VINFO_DR_STEP (stmt_info);
1014 else
1015 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
1016
1017 if (tree_int_cst_compare (step, size_zero_node) == 0)
1018 *inv_p = true;
1019 else
1020 *inv_p = false;
1021
1022 /* Create an expression for the first address accessed by this load
1023 in LOOP. */
1024 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
1025
1026 if (vect_print_dump_info (REPORT_DETAILS))
1027 {
1028 tree data_ref_base = base_name;
1029 fprintf (vect_dump, "create vector-pointer variable to type: ");
1030 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1031 if (TREE_CODE (data_ref_base) == VAR_DECL)
1032 fprintf (vect_dump, " vectorizing a one dimensional array ref: ");
1033 else if (TREE_CODE (data_ref_base) == ARRAY_REF)
1034 fprintf (vect_dump, " vectorizing a multidimensional array ref: ");
1035 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
1036 fprintf (vect_dump, " vectorizing a record based array ref: ");
1037 else if (TREE_CODE (data_ref_base) == SSA_NAME)
1038 fprintf (vect_dump, " vectorizing a pointer ref: ");
1039 print_generic_expr (vect_dump, base_name, TDF_SLIM);
1040 }
1041
1042 /** (1) Create the new vector-pointer variable: **/
1043 if (type)
1044 vect_ptr_type = build_pointer_type (type);
1045 else
1046 vect_ptr_type = build_pointer_type (vectype);
1047 vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
1048 get_name (base_name));
1049 add_referenced_var (vect_ptr);
1050
1051 /** (2) Add aliasing information to the new vector-pointer:
1052 (The points-to info (DR_PTR_INFO) may be defined later.) **/
1053
1054 tag = DR_SYMBOL_TAG (dr);
1055 gcc_assert (tag);
1056
1057   /* If tag is a variable (and NOT_A_TAG) then a new symbol memory
1058      tag must be created with tag added to its may-alias list.  */
1059 if (!MTAG_P (tag))
1060 new_type_alias (vect_ptr, tag, DR_REF (dr));
1061 else
1062 set_symbol_mem_tag (vect_ptr, tag);
1063
1064 var_ann (vect_ptr)->subvars = DR_SUBVARS (dr);
1065
1066 /** Note: If the dataref is in an inner-loop nested in LOOP, and we are
1067 vectorizing LOOP (i.e. outer-loop vectorization), we need to create two
1068 def-use update cycles for the pointer: One relative to the outer-loop
1069 (LOOP), which is what steps (3) and (4) below do. The other is relative
1070 to the inner-loop (which is the inner-most loop containing the dataref),
1071      and this is done by step (5) below.
1072
1073 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
1074 inner-most loop, and so steps (3),(4) work the same, and step (5) is
1075 redundant. Steps (3),(4) create the following:
1076
1077 vp0 = &base_addr;
1078 LOOP: vp1 = phi(vp0,vp2)
1079 ...
1080 ...
1081 vp2 = vp1 + step
1082 goto LOOP
1083
1084 If there is an inner-loop nested in loop, then step (5) will also be
1085 applied, and an additional update in the inner-loop will be created:
1086
1087 vp0 = &base_addr;
1088 LOOP: vp1 = phi(vp0,vp2)
1089 ...
1090 inner: vp3 = phi(vp1,vp4)
1091 vp4 = vp3 + inner_step
1092 if () goto inner
1093 ...
1094 vp2 = vp1 + step
1095 if () goto LOOP */
1096
1097   /** (3) Calculate the initial address of the vector-pointer, and set
1098 the vector-pointer to point to it before the loop: **/
1099
1100   /* Create: &(base[init_val+offset]) in the loop preheader.  */
1101
1102 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1103 offset, loop);
1104 pe = loop_preheader_edge (loop);
1105 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list);
1106 gcc_assert (!new_bb);
1107 *initial_address = new_temp;
1108
1109 /* Create: p = (vectype *) initial_base */
1110 vec_stmt = fold_convert (vect_ptr_type, new_temp);
1111 vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt);
1112 vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt);
1113 GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init;
1114 new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt);
1115 gcc_assert (!new_bb);
1116
1117
1118 /** (4) Handle the updating of the vector-pointer inside the loop.
1119 This is needed when ONLY_INIT is false, and also when AT_LOOP
1120 is the inner-loop nested in LOOP (during outer-loop vectorization).
1121 **/
1122
1123 if (only_init && at_loop == loop) /* No update in loop is required. */
1124 {
1125 /* Copy the points-to information if it exists. */
1126 if (DR_PTR_INFO (dr))
1127 duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr));
1128 vptr = vect_ptr_init;
1129 }
1130 else
1131 {
1132 /* The step of the vector pointer is the Vector Size. */
1133 tree step = TYPE_SIZE_UNIT (vectype);
1134 /* One exception to the above is when the scalar step of the load in
1135 LOOP is zero. In this case the step here is also zero. */
1136 if (*inv_p)
1137 step = size_zero_node;
1138
1139 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
1140
1141 create_iv (vect_ptr_init,
1142 fold_convert (vect_ptr_type, step),
1143 NULL_TREE, loop, &incr_bsi, insert_after,
1144 &indx_before_incr, &indx_after_incr);
1145 incr = bsi_stmt (incr_bsi);
1146 set_stmt_info (stmt_ann (incr),
1147 new_stmt_vec_info (incr, loop_vinfo));
1148
1149 /* Copy the points-to information if it exists. */
1150 if (DR_PTR_INFO (dr))
1151 {
1152 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1153 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1154 }
1155 merge_alias_info (vect_ptr_init, indx_before_incr);
1156 merge_alias_info (vect_ptr_init, indx_after_incr);
1157 if (ptr_incr)
1158 *ptr_incr = incr;
1159
1160 vptr = indx_before_incr;
1161 }
1162
1163 if (!nested_in_vect_loop || only_init)
1164 return vptr;
1165
1166
1167 /** (5) Handle the updating of the vector-pointer inside the inner-loop
1168 nested in LOOP, if exists: **/
1169
1170 gcc_assert (nested_in_vect_loop);
1171 if (!only_init)
1172 {
1173 standard_iv_increment_position (containing_loop, &incr_bsi,
1174 &insert_after);
1175 create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE,
1176 containing_loop, &incr_bsi, insert_after, &indx_before_incr,
1177 &indx_after_incr);
1178 incr = bsi_stmt (incr_bsi);
1179 set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo));
1180
1181 /* Copy the points-to information if it exists. */
1182 if (DR_PTR_INFO (dr))
1183 {
1184 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
1185 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
1186 }
1187 merge_alias_info (vect_ptr_init, indx_before_incr);
1188 merge_alias_info (vect_ptr_init, indx_after_incr);
1189 if (ptr_incr)
1190 *ptr_incr = incr;
1191
1192 return indx_before_incr;
1193 }
1194 else
1195 gcc_unreachable ();
1196 }
1197
1198
1199 /* Function bump_vector_ptr
1200
1201 Increment a pointer (to a vector type) by vector-size. If requested,
1202 i.e. if PTR-INCR is given, then also connect the new increment stmt
1203 to the existing def-use update-chain of the pointer, by modifying
1204 the PTR_INCR as illustrated below:
1205
1206 The pointer def-use update-chain before this function:
1207 DATAREF_PTR = phi (p_0, p_2)
1208 ....
1209 PTR_INCR: p_2 = DATAREF_PTR + step
1210
1211 The pointer def-use update-chain after this function:
1212 DATAREF_PTR = phi (p_0, p_2)
1213 ....
1214 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
1215 ....
1216 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
1217
1218 Input:
1219 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
1220 in the loop.
1221 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
1222 the loop. The increment amount across iterations is expected
1223 to be vector_size.
1224 BSI - location where the new update stmt is to be placed.
1225 STMT - the original scalar memory-access stmt that is being vectorized.
1226 BUMP - optional. The offset by which to bump the pointer. If not given,
1227 the offset is assumed to be vector_size.
1228
1229 Output: Return NEW_DATAREF_PTR as illustrated above.
1230
1231 */
1232
1233 static tree
1234 bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi,
1235 tree stmt, tree bump)
1236 {
1237 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1238 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1239 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1240 tree vptr_type = TREE_TYPE (dataref_ptr);
1241 tree ptr_var = SSA_NAME_VAR (dataref_ptr);
1242 tree update = TYPE_SIZE_UNIT (vectype);
1243 tree incr_stmt;
1244 ssa_op_iter iter;
1245 use_operand_p use_p;
1246 tree new_dataref_ptr;
1247
1248 if (bump)
1249 update = bump;
1250
1251 incr_stmt = build_gimple_modify_stmt (ptr_var,
1252 build2 (POINTER_PLUS_EXPR, vptr_type,
1253 dataref_ptr, update));
1254 new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt);
1255 GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr;
1256 vect_finish_stmt_generation (stmt, incr_stmt, bsi);
1257
1258 /* Copy the points-to information if it exists. */
1259 if (DR_PTR_INFO (dr))
1260 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
1261 merge_alias_info (new_dataref_ptr, dataref_ptr);
1262
1263 if (!ptr_incr)
1264 return new_dataref_ptr;
1265
1266 /* Update the vector-pointer's cross-iteration increment. */
1267 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
1268 {
1269 tree use = USE_FROM_PTR (use_p);
1270
1271 if (use == dataref_ptr)
1272 SET_USE (use_p, new_dataref_ptr);
1273 else
1274 gcc_assert (tree_int_cst_compare (use, update) == 0);
1275 }
1276
1277 return new_dataref_ptr;
1278 }
1279
1280
1281 /* Function vect_create_destination_var.
1282
1283 Create a new temporary of type VECTYPE. */
1284
1285 static tree
1286 vect_create_destination_var (tree scalar_dest, tree vectype)
1287 {
1288 tree vec_dest;
1289 const char *new_name;
1290 tree type;
1291 enum vect_var_kind kind;
1292
1293 kind = vectype ? vect_simple_var : vect_scalar_var;
1294 type = vectype ? vectype : TREE_TYPE (scalar_dest);
1295
1296 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
1297
1298 new_name = get_name (scalar_dest);
1299 if (!new_name)
1300 new_name = "var_";
1301 vec_dest = vect_get_new_vect_var (type, kind, new_name);
1302 add_referenced_var (vec_dest);
1303
1304 return vec_dest;
1305 }
1306
1307
1308 /* Function vect_init_vector.
1309
1310 Insert a new stmt (INIT_STMT) that initializes a new vector variable with
1311 the vector elements of VECTOR_VAR. Place the initialization at BSI if it
1312 is not NULL. Otherwise, place the initialization at the loop preheader.
1313 Return the DEF of INIT_STMT.
1314 It will be used in the vectorization of STMT. */
1315
1316 static tree
1317 vect_init_vector (tree stmt, tree vector_var, tree vector_type,
1318 block_stmt_iterator *bsi)
1319 {
1320 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1321 tree new_var;
1322 tree init_stmt;
1323 tree vec_oprnd;
1324 edge pe;
1325 tree new_temp;
1326 basic_block new_bb;
1327
1328 new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_");
1329 add_referenced_var (new_var);
1330 init_stmt = build_gimple_modify_stmt (new_var, vector_var);
1331 new_temp = make_ssa_name (new_var, init_stmt);
1332 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp;
1333
1334 if (bsi)
1335 vect_finish_stmt_generation (stmt, init_stmt, bsi);
1336 else
1337 {
1338 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1339 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1340
1341 if (nested_in_vect_loop_p (loop, stmt))
1342 loop = loop->inner;
1343 pe = loop_preheader_edge (loop);
1344 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1345 gcc_assert (!new_bb);
1346 }
1347
1348 if (vect_print_dump_info (REPORT_DETAILS))
1349 {
1350 fprintf (vect_dump, "created new init_stmt: ");
1351 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1352 }
1353
1354 vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0);
1355 return vec_oprnd;
1356 }
1357
1358
1359 /* For constant and loop invariant defs of SLP_NODE this function returns
1360 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
1361 OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
1362 stmts. */
1363
1364 static void
1365 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
1366 unsigned int op_num)
1367 {
1368 VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
1369 tree stmt = VEC_index (tree, stmts, 0);
1370 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1371 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1372 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1373 tree vec_cst;
1374 tree t = NULL_TREE;
1375 int j, number_of_places_left_in_vector;
1376 tree vector_type;
1377 tree op, vop, operation;
1378 int group_size = VEC_length (tree, stmts);
1379 unsigned int vec_num, i;
1380 int number_of_copies = 1;
1381 bool is_store = false;
1382 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
1383 VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
1384 bool constant_p;
1385
1386 if (STMT_VINFO_DATA_REF (stmt_vinfo))
1387 is_store = true;
1388
1389 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
1390 created vectors. It is greater than 1 if unrolling is performed.
1391
1392 For example, we have two scalar operands, s1 and s2 (e.g., group of
1393      strided accesses of size two), while NUNITS is four (i.e., four scalars
1394 of this type can be packed in a vector). The output vector will contain
1395 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
1396 will be 2).
1397
1398 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
1399 containing the operands.
1400
1401      For example, NUNITS is four as before, and the group size is 8
1402 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
1403 {s5, s6, s7, s8}. */
1404
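  /* Illustrating the examples above: nunits = 4, group_size = 2 gives
     least_common_multiple = 4 and number_of_copies = 2 (the {s1, s2, s1, s2}
     case); nunits = 4, group_size = 8 gives number_of_copies = 1, with the
     group split over two vectors.  */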
1405 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
1406
1407 number_of_places_left_in_vector = nunits;
1408 constant_p = true;
1409 for (j = 0; j < number_of_copies; j++)
1410 {
1411 for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
1412 {
1413 operation = GIMPLE_STMT_OPERAND (stmt, 1);
1414 if (is_store)
1415 op = operation;
1416 else
1417 op = TREE_OPERAND (operation, op_num);
1418 if (!CONSTANT_CLASS_P (op))
1419 constant_p = false;
1420
1421 /* Create 'vect_ = {op0,op1,...,opn}'. */
1422 t = tree_cons (NULL_TREE, op, t);
1423
1424 number_of_places_left_in_vector--;
1425
1426 if (number_of_places_left_in_vector == 0)
1427 {
1428 number_of_places_left_in_vector = nunits;
1429
1430 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1431 gcc_assert (vector_type);
1432 if (constant_p)
1433 vec_cst = build_vector (vector_type, t);
1434 else
1435 vec_cst = build_constructor_from_list (vector_type, t);
1436 constant_p = true;
1437 VEC_quick_push (tree, voprnds,
1438 vect_init_vector (stmt, vec_cst, vector_type,
1439 NULL));
1440 t = NULL_TREE;
1441 }
1442 }
1443 }
1444
1445 /* Since the vectors are created in the reverse order, we should invert
1446 them. */
1447 vec_num = VEC_length (tree, voprnds);
1448 for (j = vec_num - 1; j >= 0; j--)
1449 {
1450 vop = VEC_index (tree, voprnds, j);
1451 VEC_quick_push (tree, *vec_oprnds, vop);
1452 }
1453
1454 VEC_free (tree, heap, voprnds);
1455
1456 /* In case that VF is greater than the unrolling factor needed for the SLP
1457 group of stmts, NUMBER_OF_VECTORS to be created is greater than
1458 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
1459 to replicate the vectors. */
1460 while (number_of_vectors > VEC_length (tree, *vec_oprnds))
1461 {
1462 for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
1463 VEC_quick_push (tree, *vec_oprnds, vop);
1464 }
1465 }
1466
1467
1468 /* Get vectorized definitions from SLP_NODE that contains corresponding
1469 vectorized def-stmts. */
1470
1471 static void
1472 vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
1473 {
1474 tree vec_oprnd;
1475 tree vec_def_stmt;
1476 unsigned int i;
1477
1478 gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
1479
1480 for (i = 0;
1481 VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
1482 i++)
1483 {
1484 gcc_assert (vec_def_stmt);
1485 vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
1486 VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
1487 }
1488 }
1489
1490
1491 /* Get vectorized definitions for SLP_NODE.
1492 If the scalar definitions are loop invariants or constants, collect them and
1493 call vect_get_constant_vectors() to create vector stmts.
1494 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
1495 must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
1496 vect_get_slp_vect_defs() to retrieve them.
1497 If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from
1498    the right node). This is used when the second operand must remain scalar.  */
1499
1500 static void
1501 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
1502 VEC (tree,heap) **vec_oprnds1)
1503 {
1504 tree operation, first_stmt;
1505
1506 /* Allocate memory for vectorized defs. */
1507 *vec_oprnds0 = VEC_alloc (tree, heap,
1508 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1509
1510 /* SLP_NODE corresponds either to a group of stores or to a group of
1511 unary/binary operations. We don't call this function for loads. */
1512 if (SLP_TREE_LEFT (slp_node))
1513 /* The defs are already vectorized. */
1514 vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
1515 else
1516 /* Build vectors from scalar defs. */
1517 vect_get_constant_vectors (slp_node, vec_oprnds0, 0);
1518
1519 first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
1520 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
1521 /* Since we don't call this function with loads, this is a group of
1522 stores. */
1523 return;
1524
1525 operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
1526 if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1)
1527 return;
1528
1529 *vec_oprnds1 = VEC_alloc (tree, heap,
1530 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
1531
1532 if (SLP_TREE_RIGHT (slp_node))
1533 /* The defs are already vectorized. */
1534 vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
1535 else
1536 /* Build vectors from scalar defs. */
1537 vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
1538 }
1539
1540
1541 /* Function get_initial_def_for_induction
1542
1543 Input:
1544 STMT - a stmt that performs an induction operation in the loop.
1545 IV_PHI - the initial value of the induction variable
1546
1547 Output:
1548 Return a vector variable, initialized with the first VF values of
1549 the induction variable. E.g., for an iv with IV_PHI='X' and
1550 evolution S, for a vector of 4 units, we want to return:
1551 [X, X + S, X + 2*S, X + 3*S]. */
1552
1553 static tree
1554 get_initial_def_for_induction (tree iv_phi)
1555 {
1556 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
1557 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1558 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1559 tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi));
1560 tree vectype;
1561 int nunits;
1562 edge pe = loop_preheader_edge (loop);
1563 struct loop *iv_loop;
1564 basic_block new_bb;
1565 tree vec, vec_init, vec_step, t;
1566 tree access_fn;
1567 tree new_var;
1568 tree new_name;
1569 tree init_stmt;
1570 tree induction_phi, induc_def, new_stmt, vec_def, vec_dest;
1571 tree init_expr, step_expr;
1572 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1573 int i;
1574 bool ok;
1575 int ncopies;
1576 tree expr;
1577 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
1578 bool nested_in_vect_loop = false;
1579 tree stmts;
1580 imm_use_iterator imm_iter;
1581 use_operand_p use_p;
1582 tree exit_phi;
1583 edge latch_e;
1584 tree loop_arg;
1585 block_stmt_iterator si;
1586 basic_block bb = bb_for_stmt (iv_phi);
1587
1588 vectype = get_vectype_for_scalar_type (scalar_type);
1589 gcc_assert (vectype);
1590 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1591 ncopies = vf / nunits;
1592
1593 gcc_assert (phi_info);
1594 gcc_assert (ncopies >= 1);
1595
1596 /* Find the first insertion point in the BB. */
1597 si = bsi_after_labels (bb);
1598
1599 if (INTEGRAL_TYPE_P (scalar_type))
1600 step_expr = build_int_cst (scalar_type, 0);
1601 else
1602 step_expr = build_real (scalar_type, dconst0);
1603
1604 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
1605 if (nested_in_vect_loop_p (loop, iv_phi))
1606 {
1607 nested_in_vect_loop = true;
1608 iv_loop = loop->inner;
1609 }
1610 else
1611 iv_loop = loop;
1612 gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father);
1613
1614 latch_e = loop_latch_edge (iv_loop);
1615 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
1616
1617 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
1618 gcc_assert (access_fn);
1619 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
1620 &init_expr, &step_expr);
1621 gcc_assert (ok);
1622 pe = loop_preheader_edge (iv_loop);
1623
1624 /* Create the vector that holds the initial_value of the induction. */
1625 if (nested_in_vect_loop)
1626 {
1627 /* iv_loop is nested in the loop to be vectorized. init_expr had already
1628          been created during vectorization of previous stmts; we obtain it from
1629 the STMT_VINFO_VEC_STMT of the defining stmt. */
1630 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
1631 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
1632 }
1633 else
1634 {
1635 /* iv_loop is the loop to be vectorized. Create:
1636 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
1637 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
1638 add_referenced_var (new_var);
1639
1640 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
1641 if (stmts)
1642 {
1643 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
1644 gcc_assert (!new_bb);
1645 }
1646
1647 t = NULL_TREE;
1648 t = tree_cons (NULL_TREE, init_expr, t);
1649 for (i = 1; i < nunits; i++)
1650 {
1651 tree tmp;
1652
1653 /* Create: new_name_i = new_name + step_expr */
1654 tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr);
1655 init_stmt = build_gimple_modify_stmt (new_var, tmp);
1656 new_name = make_ssa_name (new_var, init_stmt);
1657 GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name;
1658
1659 new_bb = bsi_insert_on_edge_immediate (pe, init_stmt);
1660 gcc_assert (!new_bb);
1661
1662 if (vect_print_dump_info (REPORT_DETAILS))
1663 {
1664 fprintf (vect_dump, "created new init_stmt: ");
1665 print_generic_expr (vect_dump, init_stmt, TDF_SLIM);
1666 }
1667 t = tree_cons (NULL_TREE, new_name, t);
1668 }
1669 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
1670 vec = build_constructor_from_list (vectype, nreverse (t));
1671 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
1672 }
1673
1674
1675 /* Create the vector that holds the step of the induction. */
1676 if (nested_in_vect_loop)
1677 /* iv_loop is nested in the loop to be vectorized. Generate:
1678 vec_step = [S, S, S, S] */
1679 new_name = step_expr;
1680 else
1681 {
1682 /* iv_loop is the loop to be vectorized. Generate:
1683 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1684 expr = build_int_cst (scalar_type, vf);
1685 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1686 }
1687
1688 t = NULL_TREE;
1689 for (i = 0; i < nunits; i++)
1690 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1691 gcc_assert (CONSTANT_CLASS_P (new_name));
1692 vec = build_vector (vectype, t);
1693 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1694
1695
1696 /* Create the following def-use cycle:
1697 loop prolog:
1698 vec_init = ...
1699 vec_step = ...
1700 loop:
1701 vec_iv = PHI <vec_init, vec_loop>
1702 ...
1703 STMT
1704 ...
1705 vec_loop = vec_iv + vec_step; */
1706
1707 /* Create the induction-phi that defines the induction-operand. */
1708 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
1709 add_referenced_var (vec_dest);
1710 induction_phi = create_phi_node (vec_dest, iv_loop->header);
1711 set_stmt_info (get_stmt_ann (induction_phi),
1712 new_stmt_vec_info (induction_phi, loop_vinfo));
1713 induc_def = PHI_RESULT (induction_phi);
1714
1715 /* Create the iv update inside the loop */
1716 new_stmt = build_gimple_modify_stmt (NULL_TREE,
1717 build2 (PLUS_EXPR, vectype,
1718 induc_def, vec_step));
1719 vec_def = make_ssa_name (vec_dest, new_stmt);
1720 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1721 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1722 set_stmt_info (get_stmt_ann (new_stmt),
1723 new_stmt_vec_info (new_stmt, loop_vinfo));
1724
1725 /* Set the arguments of the phi node: */
1726 add_phi_arg (induction_phi, vec_init, pe);
1727 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
1728
1729
1730 /* In case the vectorization factor (VF) is bigger than the number
1731 of elements that we can fit in a vectype (nunits), we have to generate
1732 more than one vector stmt - i.e. - we need to "unroll" the
1733 vector stmt by a factor of VF/nunits. For more details see the
1734 documentation in vectorizable_operation. */
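  /* Illustrative sketch, with assumed values VF = 8, nunits = 4 (so
     ncopies = 2), X = 0 and S = 1: the phi update above advances the iv by
     VF scalar iterations per vector-loop iteration ({8, 8, 8, 8}), whereas
     each extra copy created below is offset from the previous one by only
     nunits scalar iterations, i.e. vec_step is re-set to {4, 4, 4, 4} and
     the second copy is computed as vec_iv + {4, 4, 4, 4}.  */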
1735
1736 if (ncopies > 1)
1737 {
1738 stmt_vec_info prev_stmt_vinfo;
1739 /* FORNOW. This restriction should be relaxed. */
1740 gcc_assert (!nested_in_vect_loop);
1741
1742 /* Create the vector that holds the step of the induction. */
1743 expr = build_int_cst (scalar_type, nunits);
1744 new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr);
1745 t = NULL_TREE;
1746 for (i = 0; i < nunits; i++)
1747 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
1748 gcc_assert (CONSTANT_CLASS_P (new_name));
1749 vec = build_vector (vectype, t);
1750 vec_step = vect_init_vector (iv_phi, vec, vectype, NULL);
1751
1752 vec_def = induc_def;
1753 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
1754 for (i = 1; i < ncopies; i++)
1755 {
1756 tree tmp;
1757
1758 /* vec_i = vec_prev + vec_step */
1759 tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step);
1760 new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp);
1761 vec_def = make_ssa_name (vec_dest, new_stmt);
1762 GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def;
1763 bsi_insert_before (&si, new_stmt, BSI_SAME_STMT);
1764 set_stmt_info (get_stmt_ann (new_stmt),
1765 new_stmt_vec_info (new_stmt, loop_vinfo));
1766 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
1767 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
1768 }
1769 }
1770
1771 if (nested_in_vect_loop)
1772 {
1773 /* Find the loop-closed exit-phi of the induction, and record
1774 the final vector of induction results: */
1775 exit_phi = NULL;
1776 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
1777 {
1778 if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p))))
1779 {
1780 exit_phi = USE_STMT (use_p);
1781 break;
1782 }
1783 }
1784 if (exit_phi)
1785 {
1786 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
1787 /* FORNOW. We do not yet support the case in which an inner-loop induction
1788 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
1789 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
1790 && !STMT_VINFO_LIVE_P (stmt_vinfo));
1791
1792 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
1793 if (vect_print_dump_info (REPORT_DETAILS))
1794 {
1795 fprintf (vect_dump, "vector of inductions after inner-loop:");
1796 print_generic_expr (vect_dump, new_stmt, TDF_SLIM);
1797 }
1798 }
1799 }
1800
1801
1802 if (vect_print_dump_info (REPORT_DETAILS))
1803 {
1804 fprintf (vect_dump, "transform induction: created def-use cycle:");
1805 print_generic_expr (vect_dump, induction_phi, TDF_SLIM);
1806 fprintf (vect_dump, "\n");
1807 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM);
1808 }
1809
1810 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
1811 return induc_def;
1812 }
1813
1814
1815 /* Function vect_get_vec_def_for_operand.
1816
1817 OP is an operand in STMT. This function returns a (vector) def that will be
1818 used in the vectorized stmt for STMT.
1819
1820 In the case that OP is an SSA_NAME which is defined in the loop, then
1821 STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def.
1822
1823 In case OP is an invariant or constant, a new stmt that creates a vector def
1824 needs to be introduced. */
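/* For illustration (assuming a V4SI vectype): if OP is the constant 5, the
   function below builds the vector {5, 5, 5, 5} and emits a stmt that
   initializes it before the loop (see vect_init_vector); if OP is defined
   by a stmt inside the loop, the vector def is simply taken from the
   STMT_VINFO_VEC_STMT of that defining stmt.  */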
1825
1826 static tree
1827 vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def)
1828 {
1829 tree vec_oprnd;
1830 tree vec_stmt;
1831 tree def_stmt;
1832 stmt_vec_info def_stmt_info = NULL;
1833 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1834 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
1835 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
1836 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1837 tree vec_inv;
1838 tree vec_cst;
1839 tree t = NULL_TREE;
1840 tree def;
1841 int i;
1842 enum vect_def_type dt;
1843 bool is_simple_use;
1844 tree vector_type;
1845
1846 if (vect_print_dump_info (REPORT_DETAILS))
1847 {
1848 fprintf (vect_dump, "vect_get_vec_def_for_operand: ");
1849 print_generic_expr (vect_dump, op, TDF_SLIM);
1850 }
1851
1852 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
1853 gcc_assert (is_simple_use);
1854 if (vect_print_dump_info (REPORT_DETAILS))
1855 {
1856 if (def)
1857 {
1858 fprintf (vect_dump, "def = ");
1859 print_generic_expr (vect_dump, def, TDF_SLIM);
1860 }
1861 if (def_stmt)
1862 {
1863 fprintf (vect_dump, " def_stmt = ");
1864 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
1865 }
1866 }
1867
1868 switch (dt)
1869 {
1870 /* Case 1: operand is a constant. */
1871 case vect_constant_def:
1872 {
1873 if (scalar_def)
1874 *scalar_def = op;
1875
1876 /* Create 'vect_cst_ = {cst,cst,...,cst}' */
1877 if (vect_print_dump_info (REPORT_DETAILS))
1878 fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits);
1879
1880 for (i = nunits - 1; i >= 0; --i)
1881 {
1882 t = tree_cons (NULL_TREE, op, t);
1883 }
1884 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
1885 gcc_assert (vector_type);
1886 vec_cst = build_vector (vector_type, t);
1887
1888 return vect_init_vector (stmt, vec_cst, vector_type, NULL);
1889 }
1890
1891 /* Case 2: operand is defined outside the loop - loop invariant. */
1892 case vect_invariant_def:
1893 {
1894 if (scalar_def)
1895 *scalar_def = def;
1896
1897 /* Create 'vec_inv = {inv,inv,..,inv}' */
1898 if (vect_print_dump_info (REPORT_DETAILS))
1899 fprintf (vect_dump, "Create vector_inv.");
1900
1901 for (i = nunits - 1; i >= 0; --i)
1902 {
1903 t = tree_cons (NULL_TREE, def, t);
1904 }
1905
1906 /* FIXME: use build_constructor directly. */
1907 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def));
1908 gcc_assert (vector_type);
1909 vec_inv = build_constructor_from_list (vector_type, t);
1910 return vect_init_vector (stmt, vec_inv, vector_type, NULL);
1911 }
1912
1913 /* Case 3: operand is defined inside the loop. */
1914 case vect_loop_def:
1915 {
1916 if (scalar_def)
1917 *scalar_def = def_stmt;
1918
1919 /* Get the def from the vectorized stmt. */
1920 def_stmt_info = vinfo_for_stmt (def_stmt);
1921 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1922 gcc_assert (vec_stmt);
1923 if (TREE_CODE (vec_stmt) == PHI_NODE)
1924 vec_oprnd = PHI_RESULT (vec_stmt);
1925 else
1926 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0);
1927 return vec_oprnd;
1928 }
1929
1930 /* Case 4: operand is defined by a loop header phi - reduction */
1931 case vect_reduction_def:
1932 {
1933 struct loop *loop;
1934
1935 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1936 loop = (bb_for_stmt (def_stmt))->loop_father;
1937
1938 /* Get the def before the loop */
1939 op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
1940 return get_initial_def_for_reduction (stmt, op, scalar_def);
1941 }
1942
1943 /* Case 5: operand is defined by loop-header phi - induction. */
1944 case vect_induction_def:
1945 {
1946 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
1947
1948 /* Get the def from the vectorized stmt. */
1949 def_stmt_info = vinfo_for_stmt (def_stmt);
1950 vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info);
1951 gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE));
1952 vec_oprnd = PHI_RESULT (vec_stmt);
1953 return vec_oprnd;
1954 }
1955
1956 default:
1957 gcc_unreachable ();
1958 }
1959 }
1960
1961
1962 /* Function vect_get_vec_def_for_stmt_copy
1963
1964 Return a vector-def for an operand. This function is used when the
1965 vectorized stmt to be created (by the caller to this function) is a "copy"
1966 created in case the vectorized result cannot fit in one vector, and several
1967 copies of the vector-stmt are required. In this case the vector-def is
1968 retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field
1969 of the stmt that defines VEC_OPRND.
1970 DT is the type of the vector def VEC_OPRND.
1971
1972 Context:
1973 In case the vectorization factor (VF) is bigger than the number
1974 of elements that can fit in a vectype (nunits), we have to generate
1975 more than one vector stmt to vectorize the scalar stmt. This situation
1976 arises when there are multiple data-types operated upon in the loop; the
1977 smallest data-type determines the VF, and as a result, when vectorizing
1978 stmts operating on wider types we need to create 'VF/nunits' "copies" of the
1979 vector stmt (each computing a vector of 'nunits' results, and together
1980 computing 'VF' results in each iteration). This function is called when
1981 vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in
1982 which VF=16 and nunits=4, so the number of copies required is 4):
1983
1984 scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT
1985
1986 S1: x = load VS1.0: vx.0 = memref0 VS1.1
1987 VS1.1: vx.1 = memref1 VS1.2
1988 VS1.2: vx.2 = memref2 VS1.3
1989 VS1.3: vx.3 = memref3
1990
1991 S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1
1992 VSnew.1: vz1 = vx.1 + ... VSnew.2
1993 VSnew.2: vz2 = vx.2 + ... VSnew.3
1994 VSnew.3: vz3 = vx.3 + ...
1995
1996 The vectorization of S1 is explained in vectorizable_load.
1997 The vectorization of S2:
1998 To create the first vector-stmt out of the 4 copies - VSnew.0 -
1999 the function 'vect_get_vec_def_for_operand' is called to
2000 get the relevant vector-def for each operand of S2. For operand x it
2001 returns the vector-def 'vx.0'.
2002
2003 To create the remaining copies of the vector-stmt (VSnew.j), this
2004 function is called to get the relevant vector-def for each operand. It is
2005 obtained from the respective VS1.j stmt, which is recorded in the
2006 STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND.
2007
2008 For example, to obtain the vector-def 'vx.1' in order to create the
2009 vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'.
2010 Given 'vx.0' we obtain the stmt that defines it ('VS1.0'); from the
2011 STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1',
2012 and return its def ('vx.1').
2013 Overall, to create the above sequence this function will be called 3 times:
2014 vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0);
2015 vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1);
2016 vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */
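/* Sketch of the lookup performed below, using the names from the example
   above: given VEC_OPRND = vx.0,
       SSA_NAME_DEF_STMT (vx.0)                   --> VS1.0
       STMT_VINFO_RELATED_STMT (vinfo of VS1.0)   --> VS1.1
       GIMPLE_STMT_OPERAND (VS1.1, 0)             --> vx.1   (returned)  */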
2017
2018 static tree
2019 vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
2020 {
2021 tree vec_stmt_for_operand;
2022 stmt_vec_info def_stmt_info;
2023
2024 /* Do nothing; can reuse same def. */
2025 if (dt == vect_invariant_def || dt == vect_constant_def)
2026 return vec_oprnd;
2027
2028 vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd);
2029 def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand);
2030 gcc_assert (def_stmt_info);
2031 vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info);
2032 gcc_assert (vec_stmt_for_operand);
2033 vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0);
2034 return vec_oprnd;
2035 }
2036
2037
2038 /* Get vectorized definitions for the operands to create a copy of an original
2039 stmt. See vect_get_vec_def_for_stmt_copy() for details. */
2040
2041 static void
2042 vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
2043 VEC(tree,heap) **vec_oprnds0,
2044 VEC(tree,heap) **vec_oprnds1)
2045 {
2046 tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
2047
2048 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
2049 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2050
2051 if (vec_oprnds1 && *vec_oprnds1)
2052 {
2053 vec_oprnd = VEC_pop (tree, *vec_oprnds1);
2054 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
2055 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2056 }
2057 }
2058
2059
2060 /* Get vectorized definitions for OP0 and OP1, or, if SLP_NODE is not NULL, from SLP_NODE. */
2061
2062 static void
2063 vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
2064 VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
2065 {
2066 if (slp_node)
2067 vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
2068 else
2069 {
2070 tree vec_oprnd;
2071
2072 *vec_oprnds0 = VEC_alloc (tree, heap, 1);
2073 vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
2074 VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
2075
2076 if (op1)
2077 {
2078 *vec_oprnds1 = VEC_alloc (tree, heap, 1);
2079 vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
2080 VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
2081 }
2082 }
2083 }
2084
2085
2086 /* Function vect_finish_stmt_generation.
2087
2088 Insert a new stmt. */
2089
2090 static void
2091 vect_finish_stmt_generation (tree stmt, tree vec_stmt,
2092 block_stmt_iterator *bsi)
2093 {
2094 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2095 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2096
2097 gcc_assert (stmt == bsi_stmt (*bsi));
2098 gcc_assert (TREE_CODE (stmt) != LABEL_EXPR);
2099
2100 bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT);
2101
2102 set_stmt_info (get_stmt_ann (vec_stmt),
2103 new_stmt_vec_info (vec_stmt, loop_vinfo));
2104
2105 if (vect_print_dump_info (REPORT_DETAILS))
2106 {
2107 fprintf (vect_dump, "add new stmt: ");
2108 print_generic_expr (vect_dump, vec_stmt, TDF_SLIM);
2109 }
2110
2111 /* Make sure bsi points to the stmt that is being vectorized. */
2112 gcc_assert (stmt == bsi_stmt (*bsi));
2113
2114 #ifdef USE_MAPPED_LOCATION
2115 SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt));
2116 #else
2117 SET_EXPR_LOCUS (vec_stmt, EXPR_LOCUS (stmt));
2118 #endif
2119 }
2120
2121
2122 /* Function get_initial_def_for_reduction
2123
2124 Input:
2125 STMT - a stmt that performs a reduction operation in the loop.
2126 INIT_VAL - the initial value of the reduction variable
2127
2128 Output:
2129 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2130 of the reduction (used for adjusting the epilog - see below).
2131 Return a vector variable, initialized according to the operation that STMT
2132 performs. This vector will be used as the initial value of the
2133 vector of partial results.
2134
2135 Option1 (adjust in epilog): Initialize the vector as follows:
2136 add: [0,0,...,0,0]
2137 mult: [1,1,...,1,1]
2138 min/max: [init_val,init_val,..,init_val,init_val]
2139 bit and/or: [init_val,init_val,..,init_val,init_val]
2140 and when necessary (e.g. add/mult case) let the caller know
2141 that it needs to adjust the result by init_val.
2142
2143 Option2: Initialize the vector as follows:
2144 add: [0,0,...,0,init_val]
2145 mult: [1,1,...,1,init_val]
2146 min/max: [init_val,init_val,...,init_val]
2147 bit and/or: [init_val,init_val,...,init_val]
2148 and no adjustments are needed.
2149
2150 For example, for the following code:
2151
2152 s = init_val;
2153 for (i=0;i<n;i++)
2154 s = s + a[i];
2155
2156 STMT is 's = s + a[i]', and the reduction variable is 's'.
2157 For a vector of 4 units, we want to return either [0,0,0,init_val],
2158 or [0,0,0,0] and let the caller know that it needs to adjust
2159 the result at the end by 'init_val'.
2160
2161 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2162 initialization vector is simpler (same element in all entries).
2163 A cost model should help decide between these two schemes. */
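/* For illustration, assume a V4SI vectype and a constant init_val = 5
   (non-nested case): for a PLUS (or WIDEN_SUM/DOT_PROD) reduction the
   function returns {0, 0, 0, 0} and sets ADJUSTMENT_DEF to 5, so the epilog
   adds 5 back to the reduced result; for a MIN/MAX reduction it returns the
   splat {5, 5, 5, 5} and sets ADJUSTMENT_DEF to NULL_TREE, so no adjustment
   is needed.  */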
2164
2165 static tree
2166 get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def)
2167 {
2168 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2169 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2170 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2171 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2172 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2173 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2174 tree type = TREE_TYPE (init_val);
2175 tree vecdef;
2176 tree def_for_init;
2177 tree init_def;
2178 tree t = NULL_TREE;
2179 int i;
2180 tree vector_type;
2181 bool nested_in_vect_loop = false;
2182
2183 gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type));
2184 if (nested_in_vect_loop_p (loop, stmt))
2185 nested_in_vect_loop = true;
2186 else
2187 gcc_assert (loop == (bb_for_stmt (stmt))->loop_father);
2188
2189 vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2190
2191 switch (code)
2192 {
2193 case WIDEN_SUM_EXPR:
2194 case DOT_PROD_EXPR:
2195 case PLUS_EXPR:
2196 if (nested_in_vect_loop)
2197 *adjustment_def = vecdef;
2198 else
2199 *adjustment_def = init_val;
2200 /* Create a vector of zeros for init_def. */
2201 if (SCALAR_FLOAT_TYPE_P (type))
2202 def_for_init = build_real (type, dconst0);
2203 else
2204 def_for_init = build_int_cst (type, 0);
2205 for (i = nunits - 1; i >= 0; --i)
2206 t = tree_cons (NULL_TREE, def_for_init, t);
2207 vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init));
2208 gcc_assert (vector_type);
2209 init_def = build_vector (vector_type, t);
2210 break;
2211
2212 case MIN_EXPR:
2213 case MAX_EXPR:
2214 *adjustment_def = NULL_TREE;
2215 init_def = vecdef;
2216 break;
2217
2218 default:
2219 gcc_unreachable ();
2220 }
2221
2222 return init_def;
2223 }
2224
2225
2226 /* Function vect_create_epilog_for_reduction
2227
2228 Create code at the loop-epilog to finalize the result of a reduction
2229 computation.
2230
2231 VECT_DEF is a vector of partial results.
2232 REDUC_CODE is the tree-code for the epilog reduction.
2233 STMT is the scalar reduction stmt that is being vectorized.
2234 REDUCTION_PHI is the phi-node that carries the reduction computation.
2235
2236 This function:
2237 1. Creates the reduction def-use cycle: sets the arguments for
2238 REDUCTION_PHI:
2239 The loop-entry argument is the vectorized initial-value of the reduction.
2240 The loop-latch argument is VECT_DEF - the vector of partial sums.
2241 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2242 by applying the operation specified by REDUC_CODE if available, or by
2243 other means (whole-vector shifts or a scalar loop).
2244 The function also creates a new phi node at the loop exit to preserve
2245 loop-closed form, as illustrated below.
2246
2247 The flow at the entry to this function:
2248
2249 loop:
2250 vec_def = phi <null, null> # REDUCTION_PHI
2251 VECT_DEF = vector_stmt # vectorized form of STMT
2252 s_loop = scalar_stmt # (scalar) STMT
2253 loop_exit:
2254 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2255 use <s_out0>
2256 use <s_out0>
2257
2258 The above is transformed by this function into:
2259
2260 loop:
2261 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2262 VECT_DEF = vector_stmt # vectorized form of STMT
2263 s_loop = scalar_stmt # (scalar) STMT
2264 loop_exit:
2265 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2266 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2267 v_out2 = reduce <v_out1>
2268 s_out3 = extract_field <v_out2, 0>
2269 s_out4 = adjust_result <s_out3>
2270 use <s_out4>
2271 use <s_out4>
2272 */
2273
2274 static void
2275 vect_create_epilog_for_reduction (tree vect_def, tree stmt,
2276 enum tree_code reduc_code, tree reduction_phi)
2277 {
2278 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2279 tree vectype;
2280 enum machine_mode mode;
2281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2282 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2283 basic_block exit_bb;
2284 tree scalar_dest;
2285 tree scalar_type;
2286 tree new_phi;
2287 block_stmt_iterator exit_bsi;
2288 tree vec_dest;
2289 tree new_temp = NULL_TREE;
2290 tree new_name;
2291 tree epilog_stmt = NULL_TREE;
2292 tree new_scalar_dest, exit_phi, new_dest;
2293 tree bitsize, bitpos, bytesize;
2294 enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1));
2295 tree adjustment_def;
2296 tree vec_initial_def;
2297 tree orig_name;
2298 imm_use_iterator imm_iter;
2299 use_operand_p use_p;
2300 bool extract_scalar_result = false;
2301 tree reduction_op, expr;
2302 tree orig_stmt;
2303 tree use_stmt;
2304 tree operation = GIMPLE_STMT_OPERAND (stmt, 1);
2305 bool nested_in_vect_loop = false;
2306 int op_type;
2307 VEC(tree,heap) *phis = NULL;
2308 int i;
2309
2310 if (nested_in_vect_loop_p (loop, stmt))
2311 {
2312 loop = loop->inner;
2313 nested_in_vect_loop = true;
2314 }
2315
2316 op_type = TREE_OPERAND_LENGTH (operation);
2317 reduction_op = TREE_OPERAND (operation, op_type-1);
2318 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2319 gcc_assert (vectype);
2320 mode = TYPE_MODE (vectype);
2321
2322 /*** 1. Create the reduction def-use cycle ***/
2323
2324 /* 1.1 set the loop-entry arg of the reduction-phi: */
2325 /* For the case of reduction, vect_get_vec_def_for_operand returns
2326 the scalar def before the loop, that defines the initial value
2327 of the reduction variable. */
2328 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2329 &adjustment_def);
2330 add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
2331
2332 /* 1.2 set the loop-latch arg for the reduction-phi: */
2333 add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
2334
2335 if (vect_print_dump_info (REPORT_DETAILS))
2336 {
2337 fprintf (vect_dump, "transform reduction: created def-use cycle:");
2338 print_generic_expr (vect_dump, reduction_phi, TDF_SLIM);
2339 fprintf (vect_dump, "\n");
2340 print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM);
2341 }
2342
2343
2344 /*** 2. Create epilog code
2345 The reduction epilog code operates across the elements of the vector
2346 of partial results computed by the vectorized loop.
2347 The reduction epilog code consists of:
2348 step 1: compute the scalar result in a vector (v_out2)
2349 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2350 step 3: adjust the scalar result (s_out3) if needed.
2351
2352 Step 1 can be accomplished using one of the following three schemes:
2353 (scheme 1) using reduc_code, if available.
2354 (scheme 2) using whole-vector shifts, if available.
2355 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2356 combined.
2357
2358 The overall epilog code looks like this:
2359
2360 s_out0 = phi <s_loop> # original EXIT_PHI
2361 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2362 v_out2 = reduce <v_out1> # step 1
2363 s_out3 = extract_field <v_out2, 0> # step 2
2364 s_out4 = adjust_result <s_out3> # step 3
2365
2366 (step 3 is optional, and steps 1 and 2 may be combined).
2367 Lastly, the uses of s_out0 are replaced by s_out4.
2368
2369 ***/
2370
2371 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2372 v_out1 = phi <v_loop> */
2373
2374 exit_bb = single_exit (loop)->dest;
2375 new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2376 SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def);
2377 exit_bsi = bsi_after_labels (exit_bb);
2378
2379 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2380 (i.e. when reduc_code is not available) and in the final adjustment
2381 code (if needed). Also get the original scalar reduction variable as
2382 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2383 represents a reduction pattern), the tree-code and scalar-def are
2384 taken from the original stmt that the pattern-stmt (STMT) replaces.
2385 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2386 are taken from STMT. */
2387
2388 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2389 if (!orig_stmt)
2390 {
2391 /* Regular reduction */
2392 orig_stmt = stmt;
2393 }
2394 else
2395 {
2396 /* Reduction pattern */
2397 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2398 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2399 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2400 }
2401 code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2402 scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0);
2403 scalar_type = TREE_TYPE (scalar_dest);
2404 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2405 bitsize = TYPE_SIZE (scalar_type);
2406 bytesize = TYPE_SIZE_UNIT (scalar_type);
2407
2408
2409 /* In case this is a reduction in an inner-loop while vectorizing an outer
2410 loop - we don't need to extract a single scalar result at the end of the
2411 inner-loop. The final vector of partial results will be used in the
2412 vectorized outer-loop, or reduced to a scalar result at the end of the
2413 outer-loop. */
2414 if (nested_in_vect_loop)
2415 goto vect_finalize_reduction;
2416
2417 /* 2.3 Create the reduction code, using one of the three schemes described
2418 above. */
2419
2420 if (reduc_code < NUM_TREE_CODES)
2421 {
2422 tree tmp;
2423
2424 /*** Case 1: Create:
2425 v_out2 = reduc_expr <v_out1> */
2426
2427 if (vect_print_dump_info (REPORT_DETAILS))
2428 fprintf (vect_dump, "Reduce using direct vector reduction.");
2429
2430 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2431 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2432 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2433 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2434 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2435 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2436
2437 extract_scalar_result = true;
2438 }
2439 else
2440 {
2441 enum tree_code shift_code = 0;
2442 bool have_whole_vector_shift = true;
2443 int bit_offset;
2444 int element_bitsize = tree_low_cst (bitsize, 1);
2445 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2446 tree vec_temp;
2447
2448 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2449 shift_code = VEC_RSHIFT_EXPR;
2450 else
2451 have_whole_vector_shift = false;
2452
2453 /* Regardless of whether we have a whole vector shift, if we're
2454 emulating the operation via tree-vect-generic, we don't want
2455 to use it. Only the first round of the reduction is likely
2456 to still be profitable via emulation. */
2457 /* ??? It might be better to emit a reduction tree code here, so that
2458 tree-vect-generic can expand the first round via bit tricks. */
2459 if (!VECTOR_MODE_P (mode))
2460 have_whole_vector_shift = false;
2461 else
2462 {
2463 optab optab = optab_for_tree_code (code, vectype);
2464 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2465 have_whole_vector_shift = false;
2466 }
2467
2468 if (have_whole_vector_shift)
2469 {
2470 /*** Case 2: Create:
2471 for (offset = VS/2; offset >= element_size; offset/=2)
2472 {
2473 Create: va' = vec_shift <va, offset>
2474 Create: va = vop <va, va'>
2475 } */
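	  /* For illustration, with an assumed V4SI vector {a0, a1, a2, a3} and
	     a PLUS reduction: the first iteration shifts by half the vector
	     and combines, so that two elements hold a0+a2 and a1+a3; the
	     second iteration shifts by one element and combines again,
	     leaving the full sum a0+a1+a2+a3 in a single element, which is
	     then extracted in section 2.4 below.  Only log2(nunits) vector
	     operations are needed, versus nunits-1 scalar combine operations
	     (plus the extracts) for case 3.  */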
2476
2477 if (vect_print_dump_info (REPORT_DETAILS))
2478 fprintf (vect_dump, "Reduce using vector shifts");
2479
2480 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2481 new_temp = PHI_RESULT (new_phi);
2482
2483 for (bit_offset = vec_size_in_bits/2;
2484 bit_offset >= element_bitsize;
2485 bit_offset /= 2)
2486 {
2487 tree bitpos = size_int (bit_offset);
2488 tree tmp = build2 (shift_code, vectype, new_temp, bitpos);
2489 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2490 new_name = make_ssa_name (vec_dest, epilog_stmt);
2491 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2492 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2493
2494 tmp = build2 (code, vectype, new_name, new_temp);
2495 epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp);
2496 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2497 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2498 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2499 }
2500
2501 extract_scalar_result = true;
2502 }
2503 else
2504 {
2505 tree rhs;
2506
2507 /*** Case 3: Create:
2508 s = extract_field <v_out2, 0>
2509 for (offset = element_size;
2510 offset < vector_size;
2511 offset += element_size;)
2512 {
2513 Create: s' = extract_field <v_out2, offset>
2514 Create: s = op <s, s'>
2515 } */
2516
2517 if (vect_print_dump_info (REPORT_DETAILS))
2518 fprintf (vect_dump, "Reduce using scalar code. ");
2519
2520 vec_temp = PHI_RESULT (new_phi);
2521 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2522 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2523 bitsize_zero_node);
2524 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2525 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2526 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2527 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2528 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2529
2530 for (bit_offset = element_bitsize;
2531 bit_offset < vec_size_in_bits;
2532 bit_offset += element_bitsize)
2533 {
2534 tree tmp;
2535 tree bitpos = bitsize_int (bit_offset);
2536 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2537 bitpos);
2538
2539 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2540 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2541 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2542 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name;
2543 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2544
2545 tmp = build2 (code, scalar_type, new_name, new_temp);
2546 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp);
2547 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2548 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2549 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2550 }
2551
2552 extract_scalar_result = false;
2553 }
2554 }
2555
2556 /* 2.4 Extract the final scalar result. Create:
2557 s_out3 = extract_field <v_out2, bitpos> */
2558
2559 if (extract_scalar_result)
2560 {
2561 tree rhs;
2562
2563 gcc_assert (!nested_in_vect_loop);
2564 if (vect_print_dump_info (REPORT_DETAILS))
2565 fprintf (vect_dump, "extract scalar result");
2566
2567 if (BYTES_BIG_ENDIAN)
2568 bitpos = size_binop (MULT_EXPR,
2569 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2570 TYPE_SIZE (scalar_type));
2571 else
2572 bitpos = bitsize_zero_node;
2573
2574 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
2575 BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
2576 epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs);
2577 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2578 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2579 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2580 }
2581
2582 vect_finalize_reduction:
2583
2584 /* 2.5 Adjust the final result by the initial value of the reduction
2585 variable. (When such an adjustment is not needed,
2586 'adjustment_def' is NULL_TREE). For example, if code is PLUS we create:
2587 new_temp = loop_exit_def + adjustment_def */
2588
2589 if (adjustment_def)
2590 {
2591 if (nested_in_vect_loop)
2592 {
2593 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
2594 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
2595 new_dest = vect_create_destination_var (scalar_dest, vectype);
2596 }
2597 else
2598 {
2599 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
2600 expr = build2 (code, scalar_type, new_temp, adjustment_def);
2601 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
2602 }
2603 epilog_stmt = build_gimple_modify_stmt (new_dest, expr);
2604 new_temp = make_ssa_name (new_dest, epilog_stmt);
2605 GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp;
2606 bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT);
2607 }
2608
2609
2610 /* 2.6 Handle the loop-exit phi */
2611
2612 /* Replace uses of s_out0 with uses of s_out3:
2613 Find the loop-closed-use at the loop exit of the original scalar result.
2614 (The reduction result is expected to have two immediate uses - one at the
2615 latch block, and one at the loop exit). */
2616 phis = VEC_alloc (tree, heap, 10);
2617 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
2618 {
2619 if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p))))
2620 {
2621 exit_phi = USE_STMT (use_p);
2622 VEC_quick_push (tree, phis, exit_phi);
2623 }
2624 }
2625 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
2626 gcc_assert (!VEC_empty (tree, phis));
2627
2628 for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++)
2629 {
2630 if (nested_in_vect_loop)
2631 {
2632 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2633
2634 /* FORNOW. We do not yet support the case in which an inner-loop reduction
2635 is used only outside the outer-loop (i.e. not in the outer-loop itself). */
2636 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2637 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2638
2639 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
2640 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
2641 set_stmt_info (get_stmt_ann (epilog_stmt),
2642 new_stmt_vec_info (epilog_stmt, loop_vinfo));
2643 continue;
2644 }
2645
2646 /* Replace the uses: */
2647 orig_name = PHI_RESULT (exit_phi);
2648 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
2649 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2650 SET_USE (use_p, new_temp);
2651 }
2652 VEC_free (tree, heap, phis);
2653 }
2654
2655
2656 /* Function vectorizable_reduction.
2657
2658 Check if STMT performs a reduction operation that can be vectorized.
2659 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
2660 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
2661 Return FALSE if not a vectorizable STMT, TRUE otherwise.
2662
2663 This function also handles reduction idioms (patterns) that have been
2664 recognized in advance during vect_pattern_recog. In this case, STMT may be
2665 of this form:
2666 X = pattern_expr (arg0, arg1, ..., X)
2667 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
2668 sequence that had been detected and replaced by the pattern-stmt (STMT).
2669
2670 In some cases of reduction patterns, the type of the reduction variable X is
2671 different than the type of the other arguments of STMT.
2672 In such cases, the vectype that is used when transforming STMT into a vector
2673 stmt is different than the vectype that is used to determine the
2674 vectorization factor, because it consists of a different number of elements
2675 than the actual number of elements that are being operated upon in parallel.
2676
2677 For example, consider an accumulation of shorts into an int accumulator.
2678 On some targets it's possible to vectorize this pattern operating on 8
2679 shorts at a time (hence, the vectype for purposes of determining the
2680 vectorization factor should be V8HI); on the other hand, the vectype that
2681 is used to create the vector form is actually V4SI (the type of the result).
2682
2683 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
2684 indicates what is the actual level of parallelism (V8HI in the example), so
2685 that the right vectorization factor would be derived. This vectype
2686 corresponds to the type of arguments to the reduction stmt, and should *NOT*
2687 be used to create the vectorized stmt. The right vectype for the vectorized
2688 stmt is obtained from the type of the result X:
2689 get_vectype_for_scalar_type (TREE_TYPE (X))
2690
2691 This means that, contrary to "regular" reductions (or "regular" stmts in
2692 general), the following equation:
2693 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
2694 does *NOT* necessarily hold for reduction patterns. */
2695
2696 bool
2697 vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
2698 {
2699 tree vec_dest;
2700 tree scalar_dest;
2701 tree op;
2702 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
2703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2704 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2705 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2706 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2707 tree operation;
2708 enum tree_code code, orig_code, epilog_reduc_code = 0;
2709 enum machine_mode vec_mode;
2710 int op_type;
2711 optab optab, reduc_optab;
2712 tree new_temp = NULL_TREE;
2713 tree def, def_stmt;
2714 enum vect_def_type dt;
2715 tree new_phi;
2716 tree scalar_type;
2717 bool is_simple_use;
2718 tree orig_stmt;
2719 stmt_vec_info orig_stmt_info;
2720 tree expr = NULL_TREE;
2721 int i;
2722 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
2723 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
2724 stmt_vec_info prev_stmt_info;
2725 tree reduc_def;
2726 tree new_stmt = NULL_TREE;
2727 int j;
2728
2729 if (nested_in_vect_loop_p (loop, stmt))
2730 {
2731 loop = loop->inner;
2732 /* FORNOW. This restriction should be relaxed. */
2733 if (ncopies > 1)
2734 {
2735 if (vect_print_dump_info (REPORT_DETAILS))
2736 fprintf (vect_dump, "multiple types in nested loop.");
2737 return false;
2738 }
2739 }
2740
2741 gcc_assert (ncopies >= 1);
2742
2743 /* FORNOW: SLP not supported. */
2744 if (STMT_SLP_TYPE (stmt_info))
2745 return false;
2746
2747 /* 1. Is vectorizable reduction? */
2748
2749 /* Not supportable if the reduction variable is used in the loop. */
2750 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
2751 return false;
2752
2753 /* Reductions that are not used even in an enclosing outer-loop
2754 are expected to be "live" (used outside the loop). */
2755 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop
2756 && !STMT_VINFO_LIVE_P (stmt_info))
2757 return false;
2758
2759 /* Make sure it was already recognized as a reduction computation. */
2760 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2761 return false;
2762
2763 /* 2. Has this been recognized as a reduction pattern?
2764
2765 Check if STMT represents a pattern that has been recognized
2766 in earlier analysis stages. For stmts that represent a pattern,
2767 the STMT_VINFO_RELATED_STMT field records the last stmt in
2768 the original sequence that constitutes the pattern. */
2769
2770 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2771 if (orig_stmt)
2772 {
2773 orig_stmt_info = vinfo_for_stmt (orig_stmt);
2774 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
2775 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
2776 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
2777 }
2778
2779 /* 3. Check the operands of the operation. The first operands are defined
2780 inside the loop body. The last operand is the reduction variable,
2781 which is defined by the loop-header-phi. */
2782
2783 gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT);
2784
2785 operation = GIMPLE_STMT_OPERAND (stmt, 1);
2786 code = TREE_CODE (operation);
2787 op_type = TREE_OPERAND_LENGTH (operation);
2788 if (op_type != binary_op && op_type != ternary_op)
2789 return false;
2790 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
2791 scalar_type = TREE_TYPE (scalar_dest);
2792 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
2793 && !SCALAR_FLOAT_TYPE_P (scalar_type))
2794 return false;
2795
2796 /* All uses but the last are expected to be defined in the loop.
2797 The last use is the reduction variable. */
2798 for (i = 0; i < op_type-1; i++)
2799 {
2800 op = TREE_OPERAND (operation, i);
2801 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2802 gcc_assert (is_simple_use);
2803 if (dt != vect_loop_def
2804 && dt != vect_invariant_def
2805 && dt != vect_constant_def
2806 && dt != vect_induction_def)
2807 return false;
2808 }
2809
2810 op = TREE_OPERAND (operation, i);
2811 is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
2812 gcc_assert (is_simple_use);
2813 gcc_assert (dt == vect_reduction_def);
2814 gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
2815 if (orig_stmt)
2816 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2817 else
2818 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt));
2819
2820 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
2821 return false;
2822
2823 /* 4. Supportable by target? */
2824
2825 /* 4.1. check support for the operation in the loop */
2826 optab = optab_for_tree_code (code, vectype);
2827 if (!optab)
2828 {
2829 if (vect_print_dump_info (REPORT_DETAILS))
2830 fprintf (vect_dump, "no optab.");
2831 return false;
2832 }
2833 vec_mode = TYPE_MODE (vectype);
2834 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
2835 {
2836 if (vect_print_dump_info (REPORT_DETAILS))
2837 fprintf (vect_dump, "op not supported by target.");
2838 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
2839 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2840 < vect_min_worthwhile_factor (code))
2841 return false;
2842 if (vect_print_dump_info (REPORT_DETAILS))
2843 fprintf (vect_dump, "proceeding using word mode.");
2844 }
2845
2846 /* Worthwhile without SIMD support? */
2847 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
2848 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2849 < vect_min_worthwhile_factor (code))
2850 {
2851 if (vect_print_dump_info (REPORT_DETAILS))
2852 fprintf (vect_dump, "not worthwhile without SIMD support.");
2853 return false;
2854 }
2855
2856 /* 4.2. Check support for the epilog operation.
2857
2858 If STMT represents a reduction pattern, then the type of the
2859 reduction variable may be different than the type of the rest
2860 of the arguments. For example, consider the case of accumulation
2861 of shorts into an int accumulator; the original code:
2862 S1: int_a = (int) short_a;
2863 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
2864
2865 was replaced with:
2866 STMT: int_acc = widen_sum <short_a, int_acc>
2867
2868 This means that:
2869 1. The tree-code that is used to create the vector operation in the
2870 epilog code (that reduces the partial results) is not the
2871 tree-code of STMT, but is rather the tree-code of the original
2872 stmt from the pattern that STMT is replacing. I.e., in the example
2873 above we want to use 'widen_sum' in the loop, but 'plus' in the
2874 epilog.
2875 2. The type (mode) we use to check available target support
2876 for the vector operation to be created in the *epilog*, is
2877 determined by the type of the reduction variable (in the example
2878 above we'd check this: plus_optab[vect_int_mode]).
2879 However the type (mode) we use to check available target support
2880 for the vector operation to be created *inside the loop*, is
2881 determined by the type of the other arguments to STMT (in the
2882 example we'd check this: widen_sum_optab[vect_short_mode]).
2883
2884 This is contrary to "regular" reductions, in which the types of all
2885 the arguments are the same as the type of the reduction variable.
2886 For "regular" reductions we can therefore use the same vector type
2887 (and also the same tree-code) when generating the epilog code and
2888 when generating the code inside the loop. */
2889
2890 if (orig_stmt)
2891 {
2892 /* This is a reduction pattern: get the vectype from the type of the
2893 reduction variable, and get the tree-code from orig_stmt. */
2894 orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));
2895 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
2896 if (!vectype)
2897 {
2898 if (vect_print_dump_info (REPORT_DETAILS))
2899 {
2900 fprintf (vect_dump, "unsupported data-type ");
2901 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
2902 }
2903 return false;
2904 }
2905
2906 vec_mode = TYPE_MODE (vectype);
2907 }
2908 else
2909 {
2910 /* Regular reduction: the same vectype and tree-code that are used for
2911 the vector code inside the loop can also be used for the epilog code. */
2912 orig_code = code;
2913 }
2914
2915 if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
2916 return false;
2917 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
2918 if (!reduc_optab)
2919 {
2920 if (vect_print_dump_info (REPORT_DETAILS))
2921 fprintf (vect_dump, "no optab for reduction.");
2922 epilog_reduc_code = NUM_TREE_CODES;
2923 }
2924 if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
2925 {
2926 if (vect_print_dump_info (REPORT_DETAILS))
2927 fprintf (vect_dump, "reduc op not supported by target.");
2928 epilog_reduc_code = NUM_TREE_CODES;
2929 }
2930
2931 if (!vec_stmt) /* transformation not required. */
2932 {
2933 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
2934 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
2935 return false;
2936 return true;
2937 }
2938
2939 /** Transform. **/
2940
2941 if (vect_print_dump_info (REPORT_DETAILS))
2942 fprintf (vect_dump, "transform reduction.");
2943
2944 /* Create the destination vector */
2945 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2946
2947 /* Create the reduction-phi that defines the reduction-operand. */
2948 new_phi = create_phi_node (vec_dest, loop->header);
2949
2950 /* In case the vectorization factor (VF) is bigger than the number
2951 of elements that we can fit in a vectype (nunits), we have to generate
2952 more than one vector stmt - i.e - we need to "unroll" the
2953 vector stmt by a factor VF/nunits. For more details see documentation
2954 in vectorizable_operation. */
2955
2956 prev_stmt_info = NULL;
2957 for (j = 0; j < ncopies; j++)
2958 {
2959 /* Handle uses. */
2960 if (j == 0)
2961 {
2962 op = TREE_OPERAND (operation, 0);
2963 loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
2964 if (op_type == ternary_op)
2965 {
2966 op = TREE_OPERAND (operation, 1);
2967 loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
2968 }
2969
2970 /* Get the vector def for the reduction variable from the phi node */
2971 reduc_def = PHI_RESULT (new_phi);
2972 }
2973 else
2974 {
2975 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
2976 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
2977 if (op_type == ternary_op)
2978 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
2979
2980 /* Get the vector def for the reduction variable from the vectorized
2981 reduction operation generated in the previous iteration (j-1) */
2982 reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
2983 }
2984
2985 /* Arguments are ready. create the new vector stmt. */
2986 if (op_type == binary_op)
2987 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
2988 else
2989 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
2990 reduc_def);
2991 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
2992 new_temp = make_ssa_name (vec_dest, new_stmt);
2993 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
2994 vect_finish_stmt_generation (stmt, new_stmt, bsi);
2995
2996 if (j == 0)
2997 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
2998 else
2999 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3000 prev_stmt_info = vinfo_for_stmt (new_stmt);
3001 }
3002
3003 /* Finalize the reduction-phi (set its arguments) and create the
3004 epilog reduction code. */
3005 vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
3006 return true;
3007 }
3008
3009 /* Checks if CALL can be vectorized in type VECTYPE. Returns
3010 a function declaration if the target has a vectorized version
3011 of the function, or NULL_TREE if the function cannot be vectorized. */
3012
3013 tree
3014 vectorizable_function (tree call, tree vectype_out, tree vectype_in)
3015 {
3016 tree fndecl = get_callee_fndecl (call);
3017 enum built_in_function code;
3018
3019 /* We only handle functions that do not read or clobber memory -- i.e.
3020 const or novops ones. */
3021 if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS)))
3022 return NULL_TREE;
3023
3024 if (!fndecl
3025 || TREE_CODE (fndecl) != FUNCTION_DECL
3026 || !DECL_BUILT_IN (fndecl))
3027 return NULL_TREE;
3028
3029 code = DECL_FUNCTION_CODE (fndecl);
3030 return targetm.vectorize.builtin_vectorized_function (code, vectype_out,
3031 vectype_in);
3032 }
3033
3034 /* Function vectorizable_call.
3035
3036 Check if STMT performs a function call that can be vectorized.
3037 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3038 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3039 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
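/* For illustration (assuming the target provides a vectorized version of
   the builtin): a loop body such as
       b[i] = sqrtf (a[i]);
   is transformed so that each group of nunits scalar calls is replaced by
   one call returning a full vector, e.g.
       vb = vect_sqrtf (va);
   where "vect_sqrtf" stands for whatever fndecl
   targetm.vectorize.builtin_vectorized_function returns for the target;
   the name is purely illustrative.  */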
3040
3041 bool
3042 vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
3043 {
3044 tree vec_dest;
3045 tree scalar_dest;
3046 tree operation;
3047 tree op, type;
3048 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3049 stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
3050 tree vectype_out, vectype_in;
3051 int nunits_in;
3052 int nunits_out;
3053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3054 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3055 tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
3056 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3057 tree new_stmt;
3058 int ncopies, j, nargs;
3059 call_expr_arg_iterator iter;
3060 tree vargs;
3061 enum { NARROW, NONE, WIDEN } modifier;
3062
3063 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3064 return false;
3065
3066 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3067 return false;
3068
3069 /* FORNOW: SLP not supported. */
3070 if (STMT_SLP_TYPE (stmt_info))
3071 return false;
3072
3073 /* Is STMT a vectorizable call? */
3074 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3075 return false;
3076
3077 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3078 return false;
3079
3080 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3081 if (TREE_CODE (operation) != CALL_EXPR)
3082 return false;
3083
3084 /* Process function arguments. */
3085 rhs_type = NULL_TREE;
3086 nargs = 0;
3087 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3088 {
3089 /* Bail out if the function has more than two arguments; we
3090 do not have interesting builtin functions to vectorize with
3091 more than two arguments. */
3092 if (nargs >= 2)
3093 return false;
3094
3095 /* We can only handle calls with arguments of the same type. */
3096 if (rhs_type
3097 && rhs_type != TREE_TYPE (op))
3098 {
3099 if (vect_print_dump_info (REPORT_DETAILS))
3100 fprintf (vect_dump, "argument types differ.");
3101 return false;
3102 }
3103 rhs_type = TREE_TYPE (op);
3104
3105 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
3106 {
3107 if (vect_print_dump_info (REPORT_DETAILS))
3108 fprintf (vect_dump, "use not simple.");
3109 return false;
3110 }
3111
3112 ++nargs;
3113 }
3114
3115 /* A call with no arguments cannot be vectorized either. */
3116 if (nargs == 0)
3117 return false;
3118
3119 vectype_in = get_vectype_for_scalar_type (rhs_type);
3120 if (!vectype_in)
3121 return false;
3122 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3123
3124 lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
3125 vectype_out = get_vectype_for_scalar_type (lhs_type);
3126 if (!vectype_out)
3127 return false;
3128 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3129
3130 /* FORNOW: the result elements must be the same width as the argument elements (NONE), half as wide (NARROW), or twice as wide (WIDEN). */
3131 if (nunits_in == nunits_out / 2)
3132 modifier = NARROW;
3133 else if (nunits_out == nunits_in)
3134 modifier = NONE;
3135 else if (nunits_out == nunits_in / 2)
3136 modifier = WIDEN;
3137 else
3138 return false;
3139
3140 /* For now, we only vectorize functions if a target specific builtin
3141 is available. TODO -- in some cases, it might be profitable to
3142 insert the calls for pieces of the vector, in order to be able
3143 to vectorize other operations in the loop. */
3144 fndecl = vectorizable_function (operation, vectype_out, vectype_in);
3145 if (fndecl == NULL_TREE)
3146 {
3147 if (vect_print_dump_info (REPORT_DETAILS))
3148 fprintf (vect_dump, "function is not vectorizable.");
3149
3150 return false;
3151 }
3152
3153 gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
3154
3155 if (modifier == NARROW)
3156 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3157 else
3158 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3159
3160 /* Sanity check: make sure that at least one copy of the vectorized stmt
3161 needs to be generated. */
3162 gcc_assert (ncopies >= 1);
3163
3164 /* FORNOW. This restriction should be relaxed. */
3165 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3166 {
3167 if (vect_print_dump_info (REPORT_DETAILS))
3168 fprintf (vect_dump, "multiple types in nested loop.");
3169 return false;
3170 }
3171
3172 if (!vec_stmt) /* transformation not required. */
3173 {
3174 STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
3175 if (vect_print_dump_info (REPORT_DETAILS))
3176 fprintf (vect_dump, "=== vectorizable_call ===");
3177 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3178 return true;
3179 }
3180
3181 /** Transform. **/
3182
3183 if (vect_print_dump_info (REPORT_DETAILS))
3184 fprintf (vect_dump, "transform operation.");
3185
3186 /* FORNOW. This restriction should be relaxed. */
3187 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3188 {
3189 if (vect_print_dump_info (REPORT_DETAILS))
3190 fprintf (vect_dump, "multiple types in nested loop.");
3191 return false;
3192 }
3193
3194 /* Handle def. */
3195 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3196 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3197
3198 prev_stmt_info = NULL;
3199 switch (modifier)
3200 {
3201 case NONE:
3202 for (j = 0; j < ncopies; ++j)
3203 {
3204 /* Build argument list for the vectorized call. */
3205 /* FIXME: Rewrite this so that it doesn't
3206 construct a temporary list. */
3207 vargs = NULL_TREE;
3208 nargs = 0;
3209 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3210 {
3211 if (j == 0)
3212 vec_oprnd0
3213 = vect_get_vec_def_for_operand (op, stmt, NULL);
3214 else
3215 vec_oprnd0
3216 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3217
3218 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3219
3220 ++nargs;
3221 }
3222 vargs = nreverse (vargs);
3223
3224 rhs = build_function_call_expr (fndecl, vargs);
3225 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3226 new_temp = make_ssa_name (vec_dest, new_stmt);
3227 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3228
3229 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3230
3231 if (j == 0)
3232 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3233 else
3234 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3235
3236 prev_stmt_info = vinfo_for_stmt (new_stmt);
3237 }
3238
3239 break;
3240
3241 case NARROW:
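      /* In the NARROW case every vectorized call packs two input vectors per
         scalar argument into one result vector of narrower elements, so two
         vector defs are fetched and pushed for each argument below.  */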
3242 for (j = 0; j < ncopies; ++j)
3243 {
3244 /* Build argument list for the vectorized call. */
3245 /* FIXME: Rewrite this so that it doesn't
3246 construct a temporary list. */
3247 vargs = NULL_TREE;
3248 nargs = 0;
3249 FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
3250 {
3251 if (j == 0)
3252 {
3253 vec_oprnd0
3254 = vect_get_vec_def_for_operand (op, stmt, NULL);
3255 vec_oprnd1
3256 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3257 }
3258 else
3259 {
3260 vec_oprnd0
3261 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
3262 vec_oprnd1
3263 = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
3264 }
3265
3266 vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
3267 vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
3268
3269 ++nargs;
3270 }
3271 vargs = nreverse (vargs);
3272
3273 rhs = build_function_call_expr (fndecl, vargs);
3274 new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
3275 new_temp = make_ssa_name (vec_dest, new_stmt);
3276 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3277
3278 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3279
3280 if (j == 0)
3281 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3282 else
3283 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3284
3285 prev_stmt_info = vinfo_for_stmt (new_stmt);
3286 }
3287
3288 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3289
3290 break;
3291
3292 case WIDEN:
3293 /* No current target implements this case. */
3294 return false;
3295 }
3296
3297 /* The call in STMT might prevent it from being removed in dce.
3298 We cannot, however, remove it here, due to the way the ssa name
3299 it defines is mapped to the new definition. So just replace
3300 the rhs of the statement with something harmless. */
3301 type = TREE_TYPE (scalar_dest);
3302 GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
3303 update_stmt (stmt);
3304
3305 return true;
3306 }
3307
3308
3309 /* Function vect_gen_widened_results_half
3310
3311 Create a vector stmt whose code, type, number of arguments, and result
3312 variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are
3313 VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI.
3314 In the case that CODE is a CALL_EXPR, this means that a call to DECL
3315 needs to be created (DECL is a function-decl of a target-builtin).
3316 STMT is the original scalar stmt that we are vectorizing. */
3317
3318 static tree
3319 vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
3320 tree vec_oprnd0, tree vec_oprnd1, int op_type,
3321 tree vec_dest, block_stmt_iterator *bsi,
3322 tree stmt)
3323 {
3324 tree expr;
3325 tree new_stmt;
3326 tree new_temp;
3327 tree sym;
3328 ssa_op_iter iter;
3329
3330 /* Generate half of the widened result: */
3331 if (code == CALL_EXPR)
3332 {
3333 /* Target specific support */
3334 if (op_type == binary_op)
3335 expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1);
3336 else
3337 expr = build_call_expr (decl, 1, vec_oprnd0);
3338 }
3339 else
3340 {
3341 /* Generic support */
3342 gcc_assert (op_type == TREE_CODE_LENGTH (code));
3343 if (op_type == binary_op)
3344 expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1);
3345 else
3346 expr = build1 (code, vectype, vec_oprnd0);
3347 }
3348 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3349 new_temp = make_ssa_name (vec_dest, new_stmt);
3350 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3351 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3352
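  /* A call may clobber or read memory, so the new stmt can carry virtual
     operands that did not exist on the scalar stmt; mark their symbols for
     renaming so that the virtual SSA form is updated.  */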
3353 if (code == CALL_EXPR)
3354 {
3355 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
3356 {
3357 if (TREE_CODE (sym) == SSA_NAME)
3358 sym = SSA_NAME_VAR (sym);
3359 mark_sym_for_renaming (sym);
3360 }
3361 }
3362
3363 return new_stmt;
3364 }
3365
3366
3367 /* Check if STMT performs a conversion operation that can be vectorized.
3368 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3369 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3370 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3371
3372 bool
3373 vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
3374 tree *vec_stmt, slp_tree slp_node)
3375 {
3376 tree vec_dest;
3377 tree scalar_dest;
3378 tree operation;
3379 tree op0;
3380 tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
3381 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3382 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3383 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3384 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
3385 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
3386 tree new_temp;
3387 tree def, def_stmt;
3388 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3389 tree new_stmt = NULL_TREE;
3390 stmt_vec_info prev_stmt_info;
3391 int nunits_in;
3392 int nunits_out;
3393 tree vectype_out, vectype_in;
3394 int ncopies, j;
3395 tree expr;
3396 tree rhs_type, lhs_type;
3397 tree builtin_decl;
3398 enum { NARROW, NONE, WIDEN } modifier;
3399 int i;
3400 VEC(tree,heap) *vec_oprnds0 = NULL;
3401 tree vop0;
3402
3403 /* Is STMT a vectorizable conversion? */
3404
3405 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3406 return false;
3407
3408 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3409 return false;
3410
3411 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3412 return false;
3413
3414 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3415 return false;
3416
3417 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3418 code = TREE_CODE (operation);
3419 if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
3420 return false;
3421
3422 /* Check types of lhs and rhs. */
3423 op0 = TREE_OPERAND (operation, 0);
3424 rhs_type = TREE_TYPE (op0);
3425 vectype_in = get_vectype_for_scalar_type (rhs_type);
3426 if (!vectype_in)
3427 return false;
3428 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3429
3430 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3431 lhs_type = TREE_TYPE (scalar_dest);
3432 vectype_out = get_vectype_for_scalar_type (lhs_type);
3433 if (!vectype_out)
3434 return false;
3435 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3436
3437 /* FORNOW */
3438 if (nunits_in == nunits_out / 2)
3439 modifier = NARROW;
3440 else if (nunits_out == nunits_in)
3441 modifier = NONE;
3442 else if (nunits_out == nunits_in / 2)
3443 modifier = WIDEN;
3444 else
3445 return false;
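  /* For example: FLOAT_EXPR from V4SI to V4SF is NONE; FLOAT_EXPR from V4SI
     to V2DF is WIDEN (each input vector yields two output vectors);
     FIX_TRUNC_EXPR from V2DF to V4SI is NARROW (two input vectors are
     combined into one output vector).  */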
3446
3447 if (modifier == NONE)
3448 gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
3449
3450 /* Bail out if the types are both integral or non-integral. */
3451 if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
3452 || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
3453 return false;
3454
3455 if (modifier == NARROW)
3456 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
3457 else
3458 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3459
3460 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3461 this, so we can safely override NCOPIES with 1 here. */
3462 if (slp_node)
3463 ncopies = 1;
3464
3465 /* Sanity check: make sure that at least one copy of the vectorized stmt
3466 needs to be generated. */
3467 gcc_assert (ncopies >= 1);
3468
3469 /* FORNOW. This restriction should be relaxed. */
3470 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3471 {
3472 if (vect_print_dump_info (REPORT_DETAILS))
3473 fprintf (vect_dump, "multiple types in nested loop.");
3474 return false;
3475 }
3476
3477 /* Check the operands of the operation. */
3478 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3479 {
3480 if (vect_print_dump_info (REPORT_DETAILS))
3481 fprintf (vect_dump, "use not simple.");
3482 return false;
3483 }
3484
3485 /* Supportable by target? */
3486 if ((modifier == NONE
3487 && !targetm.vectorize.builtin_conversion (code, vectype_in))
3488 || (modifier == WIDEN
3489 && !supportable_widening_operation (code, stmt, vectype_in,
3490 &decl1, &decl2,
3491 &code1, &code2))
3492 || (modifier == NARROW
3493 && !supportable_narrowing_operation (code, stmt, vectype_in,
3494 &code1)))
3495 {
3496 if (vect_print_dump_info (REPORT_DETAILS))
3497 fprintf (vect_dump, "op not supported by target.");
3498 return false;
3499 }
3500
3501 if (modifier != NONE)
3502 {
3503 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3504 /* FORNOW: SLP not supported. */
3505 if (STMT_SLP_TYPE (stmt_info))
3506 return false;
3507 }
3508
3509 if (!vec_stmt) /* transformation not required. */
3510 {
3511 STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type;
3512 return true;
3513 }
3514
3515 /** Transform. **/
3516 if (vect_print_dump_info (REPORT_DETAILS))
3517 fprintf (vect_dump, "transform conversion.");
3518
3519 /* Handle def. */
3520 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3521
3522 if (modifier == NONE && !slp_node)
3523 vec_oprnds0 = VEC_alloc (tree, heap, 1);
3524
3525 prev_stmt_info = NULL;
3526 switch (modifier)
3527 {
3528 case NONE:
3529 for (j = 0; j < ncopies; j++)
3530 {
3531 tree sym;
3532 ssa_op_iter iter;
3533
3534 if (j == 0)
3535 vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
3536 else
3537 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
3538
3539 builtin_decl =
3540 targetm.vectorize.builtin_conversion (code, vectype_in);
3541 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
3542 {
3543 new_stmt = build_call_expr (builtin_decl, 1, vop0);
3544
3545 /* Arguments are ready. Create the new vector stmt. */
3546 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
3547 new_temp = make_ssa_name (vec_dest, new_stmt);
3548 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3549 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3550 FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
3551 SSA_OP_ALL_VIRTUALS)
3552 {
3553 if (TREE_CODE (sym) == SSA_NAME)
3554 sym = SSA_NAME_VAR (sym);
3555 mark_sym_for_renaming (sym);
3556 }
3557 if (slp_node)
3558 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
3559 }
3560
3561 if (j == 0)
3562 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3563 else
3564 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3565 prev_stmt_info = vinfo_for_stmt (new_stmt);
3566 }
3567 break;
3568
3569 case WIDEN:
3570 /* In case the vectorization factor (VF) is bigger than the number
3571 of elements that we can fit in a vectype (nunits), we have to
3572 generate more than one vector stmt, i.e., we need to "unroll"
3573 the vector stmt by a factor VF/nunits. */
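      /* Each copy consumes one input vector and produces two output vectors;
         CODE1/DECL1 build one half of the widened result and CODE2/DECL2 the
         other half (which half is which is decided by
         supportable_widening_operation).  */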
3574 for (j = 0; j < ncopies; j++)
3575 {
3576 if (j == 0)
3577 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3578 else
3579 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3580
3581 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
3582
3583 /* Generate first half of the widened result: */
3584 new_stmt
3585 = vect_gen_widened_results_half (code1, vectype_out, decl1,
3586 vec_oprnd0, vec_oprnd1,
3587 unary_op, vec_dest, bsi, stmt);
3588 if (j == 0)
3589 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3590 else
3591 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3592 prev_stmt_info = vinfo_for_stmt (new_stmt);
3593
3594 /* Generate second half of the widened result: */
3595 new_stmt
3596 = vect_gen_widened_results_half (code2, vectype_out, decl2,
3597 vec_oprnd0, vec_oprnd1,
3598 unary_op, vec_dest, bsi, stmt);
3599 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3600 prev_stmt_info = vinfo_for_stmt (new_stmt);
3601 }
3602 break;
3603
3604 case NARROW:
3605 /* In case the vectorization factor (VF) is bigger than the number
3606 of elements that we can fit in a vectype (nunits), we have to
3607 generate more than one vector stmt, i.e., we need to "unroll"
3608 the vector stmt by a factor VF/nunits. */
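      /* Here the situation is reversed: each copy combines two consecutive
         input vectors (VEC_OPRND0 and VEC_OPRND1) into a single vector of
         narrower elements, which is why two defs are fetched per copy.  */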
3609 for (j = 0; j < ncopies; j++)
3610 {
3611 /* Handle uses. */
3612 if (j == 0)
3613 {
3614 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
3615 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3616 }
3617 else
3618 {
3619 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
3620 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
3621 }
3622
3623 /* Arguments are ready. Create the new vector stmt. */
3624 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
3625 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
3626 new_temp = make_ssa_name (vec_dest, new_stmt);
3627 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
3628 vect_finish_stmt_generation (stmt, new_stmt, bsi);
3629
3630 if (j == 0)
3631 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
3632 else
3633 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3634
3635 prev_stmt_info = vinfo_for_stmt (new_stmt);
3636 }
3637
3638 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
3639 }
3640
3641 return true;
3642 }
3643
3644
3645 /* Function vectorizable_assignment.
3646
3647 Check if STMT performs an assignment (copy) that can be vectorized.
3648 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3649 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3650 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3651
3652 bool
3653 vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3654 slp_tree slp_node)
3655 {
3656 tree vec_dest;
3657 tree scalar_dest;
3658 tree op;
3659 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3660 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3661 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3662 tree new_temp;
3663 tree def, def_stmt;
3664 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3665 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3666 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3667 int i;
3668 VEC(tree,heap) *vec_oprnds = NULL;
3669 tree vop;
3670
3671 gcc_assert (ncopies >= 1);
3672 if (ncopies > 1)
3673 return false; /* FORNOW */
3674
3675 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3676 return false;
3677
3678 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3679 return false;
3680
3681 /* Is vectorizable assignment? */
3682 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3683 return false;
3684
3685 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3686 if (TREE_CODE (scalar_dest) != SSA_NAME)
3687 return false;
3688
3689 op = GIMPLE_STMT_OPERAND (stmt, 1);
3690 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0]))
3691 {
3692 if (vect_print_dump_info (REPORT_DETAILS))
3693 fprintf (vect_dump, "use not simple.");
3694 return false;
3695 }
3696
3697 if (!vec_stmt) /* transformation not required. */
3698 {
3699 STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
3700 if (vect_print_dump_info (REPORT_DETAILS))
3701 fprintf (vect_dump, "=== vectorizable_assignment ===");
3702 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3703 return true;
3704 }
3705
3706 /** Transform. **/
3707 if (vect_print_dump_info (REPORT_DETAILS))
3708 fprintf (vect_dump, "transform assignment.");
3709
3710 /* Handle def. */
3711 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3712
3713 /* Handle use. */
3714 vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
3715
3716 /* Arguments are ready. Create the new vector stmt. */
3717 for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
3718 {
3719 *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
3720 new_temp = make_ssa_name (vec_dest, *vec_stmt);
3721 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
3722 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
3723 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
3724
3725 if (slp_node)
3726 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
3727 }
3728
3729 VEC_free (tree, heap, vec_oprnds);
3730 return true;
3731 }
3732
3733
3734 /* Function vect_min_worthwhile_factor.
3735
3736 For a loop where we could vectorize the operation indicated by CODE,
3737 return the minimum vectorization factor that makes it worthwhile
3738 to use generic vectors. */
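/* Note: the values below are heuristics for the generic (word-mode)
   fallback.  Bitwise operations map directly onto full-word instructions,
   whereas element-wise add/subtract/negate emulated in a word presumably
   needs extra masking to keep carries from crossing element boundaries,
   hence the larger factor required before it is considered worthwhile.  */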
3739 static int
3740 vect_min_worthwhile_factor (enum tree_code code)
3741 {
3742 switch (code)
3743 {
3744 case PLUS_EXPR:
3745 case MINUS_EXPR:
3746 case NEGATE_EXPR:
3747 return 4;
3748
3749 case BIT_AND_EXPR:
3750 case BIT_IOR_EXPR:
3751 case BIT_XOR_EXPR:
3752 case BIT_NOT_EXPR:
3753 return 2;
3754
3755 default:
3756 return INT_MAX;
3757 }
3758 }
3759
3760
3761 /* Function vectorizable_induction
3762
3763 Check if PHI performs an induction computation that can be vectorized.
3764 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3765 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3766 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3767
3768 bool
3769 vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
3770 tree *vec_stmt)
3771 {
3772 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3773 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3774 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3775 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3776 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3777 tree vec_def;
3778
3779 gcc_assert (ncopies >= 1);
3780
3781 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3782 return false;
3783
3784 /* FORNOW: SLP not supported. */
3785 if (STMT_SLP_TYPE (stmt_info))
3786 return false;
3787
3788 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3789
3790 if (TREE_CODE (phi) != PHI_NODE)
3791 return false;
3792
3793 if (!vec_stmt) /* transformation not required. */
3794 {
3795 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3796 if (vect_print_dump_info (REPORT_DETAILS))
3797 fprintf (vect_dump, "=== vectorizable_induction ===");
3798 vect_model_induction_cost (stmt_info, ncopies);
3799 return true;
3800 }
3801
3802 /** Transform. **/
3803
3804 if (vect_print_dump_info (REPORT_DETAILS))
3805 fprintf (vect_dump, "transform induction phi.");
3806
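  /* The vector PHI itself is built by get_initial_def_for_induction; here we
     only record the stmt that defines the vector def it returns.  */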
3807 vec_def = get_initial_def_for_induction (phi);
3808 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3809 return true;
3810 }
3811
3812
3813 /* Function vectorizable_operation.
3814
3815 Check if STMT performs a binary or unary operation that can be vectorized.
3816 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3817 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
3818 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
3819
3820 bool
3821 vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
3822 slp_tree slp_node)
3823 {
3824 tree vec_dest;
3825 tree scalar_dest;
3826 tree operation;
3827 tree op0, op1 = NULL;
3828 tree vec_oprnd1 = NULL_TREE;
3829 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3830 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3831 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3832 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3833 enum tree_code code;
3834 enum machine_mode vec_mode;
3835 tree new_temp;
3836 int op_type;
3837 optab optab;
3838 int icode;
3839 enum machine_mode optab_op2_mode;
3840 tree def, def_stmt;
3841 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
3842 tree new_stmt = NULL_TREE;
3843 stmt_vec_info prev_stmt_info;
3844 int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
3845 int nunits_out;
3846 tree vectype_out;
3847 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
3848 int j, i;
3849 VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
3850 tree vop0, vop1;
3851 unsigned int k;
3852 bool scalar_shift_arg = false;
3853
3854 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
3855 this, so we can safely override NCOPIES with 1 here. */
3856 if (slp_node)
3857 ncopies = 1;
3858 gcc_assert (ncopies >= 1);
3859 /* FORNOW. This restriction should be relaxed. */
3860 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
3861 {
3862 if (vect_print_dump_info (REPORT_DETAILS))
3863 fprintf (vect_dump, "multiple types in nested loop.");
3864 return false;
3865 }
3866
3867 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3868 return false;
3869
3870 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
3871 return false;
3872
3873 /* Is STMT a vectorizable binary/unary operation? */
3874 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
3875 return false;
3876
3877 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
3878 return false;
3879
3880 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
3881 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
3882 if (!vectype_out)
3883 return false;
3884 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3885 if (nunits_out != nunits_in)
3886 return false;
3887
3888 operation = GIMPLE_STMT_OPERAND (stmt, 1);
3889 code = TREE_CODE (operation);
3890
3891 /* For pointer addition, we should use the normal plus for
3892 the vector addition. */
3893 if (code == POINTER_PLUS_EXPR)
3894 code = PLUS_EXPR;
3895
3896 optab = optab_for_tree_code (code, vectype);
3897
3898 /* Support only unary or binary operations. */
3899 op_type = TREE_OPERAND_LENGTH (operation);
3900 if (op_type != unary_op && op_type != binary_op)
3901 {
3902 if (vect_print_dump_info (REPORT_DETAILS))
3903 fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type);
3904 return false;
3905 }
3906
3907 op0 = TREE_OPERAND (operation, 0);
3908 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
3909 {
3910 if (vect_print_dump_info (REPORT_DETAILS))
3911 fprintf (vect_dump, "use not simple.");
3912 return false;
3913 }
3914
3915 if (op_type == binary_op)
3916 {
3917 op1 = TREE_OPERAND (operation, 1);
3918 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
3919 {
3920 if (vect_print_dump_info (REPORT_DETAILS))
3921 fprintf (vect_dump, "use not simple.");
3922 return false;
3923 }
3924 }
3925
3926 /* Supportable by target? */
3927 if (!optab)
3928 {
3929 if (vect_print_dump_info (REPORT_DETAILS))
3930 fprintf (vect_dump, "no optab.");
3931 return false;
3932 }
3933 vec_mode = TYPE_MODE (vectype);
3934 icode = (int) optab_handler (optab, vec_mode)->insn_code;
3935 if (icode == CODE_FOR_nothing)
3936 {
3937 if (vect_print_dump_info (REPORT_DETAILS))
3938 fprintf (vect_dump, "op not supported by target.");
3939 /* Check only during analysis. */
3940 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3941 || (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3942 < vect_min_worthwhile_factor (code)
3943 && !vec_stmt))
3944 return false;
3945 if (vect_print_dump_info (REPORT_DETAILS))
3946 fprintf (vect_dump, "proceeding using word mode.");
3947 }
3948
3949 /* Worthwhile without SIMD support? Check only during analysis. */
3950 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3951 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3952 < vect_min_worthwhile_factor (code)
3953 && !vec_stmt)
3954 {
3955 if (vect_print_dump_info (REPORT_DETAILS))
3956 fprintf (vect_dump, "not worthwhile without SIMD support.");
3957 return false;
3958 }
3959
3960 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
3961 {
3962 /* FORNOW: not yet supported. */
3963 if (!VECTOR_MODE_P (vec_mode))
3964 return false;
3965
3966 /* Invariant argument is needed for a vector shift
3967 by a scalar shift operand. */
3968 optab_op2_mode = insn_data[icode].operand[2].mode;
3969 if (!VECTOR_MODE_P (optab_op2_mode))
3970 {
3971 if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
3972 {
3973 if (vect_print_dump_info (REPORT_DETAILS))
3974 fprintf (vect_dump, "operand mode requires invariant"
3975 " argument.");
3976 return false;
3977 }
3978
3979 scalar_shift_arg = true;
3980 }
3981 }
3982
3983 if (!vec_stmt) /* transformation not required. */
3984 {
3985 STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
3986 if (vect_print_dump_info (REPORT_DETAILS))
3987 fprintf (vect_dump, "=== vectorizable_operation ===");
3988 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
3989 return true;
3990 }
3991
3992 /** Transform. **/
3993
3994 if (vect_print_dump_info (REPORT_DETAILS))
3995 fprintf (vect_dump, "transform binary/unary operation.");
3996
3997 /* Handle def. */
3998 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3999
4000 /* Allocate VECs for vector operands. In case of SLP, vector operands are
4001 created in the previous stages of the recursion, so no allocation is
4002 needed, except for the case of shift with scalar shift argument. In that
4003 case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to
4004 be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE.
4005 In case of loop-based vectorization we allocate VECs of size 1. We
4006 allocate VEC_OPRNDS1 only in case of binary operation. */
4007 if (!slp_node)
4008 {
4009 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4010 if (op_type == binary_op)
4011 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4012 }
4013 else if (scalar_shift_arg)
4014 vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size);
4015
4016 /* In case the vectorization factor (VF) is bigger than the number
4017 of elements that we can fit in a vectype (nunits), we have to generate
4018 more than one vector stmt, i.e., we need to "unroll" the
4019 vector stmt by a factor VF/nunits. In doing so, we record a pointer
4020 from one copy of the vector stmt to the next, in the field
4021 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
4022 stages to find the correct vector defs to be used when vectorizing
4023 stmts that use the defs of the current stmt. The example below illustrates
4024 the vectorization process when VF=16 and nunits=4 (i.e., we need to create
4025 4 vectorized stmts):
4026
4027 before vectorization:
4028 RELATED_STMT VEC_STMT
4029 S1: x = memref - -
4030 S2: z = x + 1 - -
4031
4032 step 1: vectorize stmt S1 (done in vectorizable_load. See more details
4033 there):
4034 RELATED_STMT VEC_STMT
4035 VS1_0: vx0 = memref0 VS1_1 -
4036 VS1_1: vx1 = memref1 VS1_2 -
4037 VS1_2: vx2 = memref2 VS1_3 -
4038 VS1_3: vx3 = memref3 - -
4039 S1: x = load - VS1_0
4040 S2: z = x + 1 - -
4041
4042 step2: vectorize stmt S2 (done here):
4043 To vectorize stmt S2 we first need to find the relevant vector
4044 def for the first operand 'x'. This is, as usual, obtained from
4045 the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt
4046 that defines 'x' (S1). This way we find the stmt VS1_0, and the
4047 relevant vector def 'vx0'. Having found 'vx0' we can generate
4048 the vector stmt VS2_0, and as usual, record it in the
4049 STMT_VINFO_VEC_STMT of stmt S2.
4050 When creating the second copy (VS2_1), we obtain the relevant vector
4051 def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of
4052 stmt VS1_0. This way we find the stmt VS1_1 and the relevant
4053 vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a
4054 pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0.
4055 Similarly when creating stmts VS2_2 and VS2_3. This is the resulting
4056 chain of stmts and pointers:
4057 RELATED_STMT VEC_STMT
4058 VS1_0: vx0 = memref0 VS1_1 -
4059 VS1_1: vx1 = memref1 VS1_2 -
4060 VS1_2: vx2 = memref2 VS1_3 -
4061 VS1_3: vx3 = memref3 - -
4062 S1: x = load - VS1_0
4063 VS2_0: vz0 = vx0 + v1 VS2_1 -
4064 VS2_1: vz1 = vx1 + v1 VS2_2 -
4065 VS2_2: vz2 = vx2 + v1 VS2_3 -
4066 VS2_3: vz3 = vx3 + v1 - -
4067 S2: z = x + 1 - VS2_0 */
4068
4069 prev_stmt_info = NULL;
4070 for (j = 0; j < ncopies; j++)
4071 {
4072 /* Handle uses. */
4073 if (j == 0)
4074 {
4075 if (op_type == binary_op
4076 && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
4077 {
4078 /* Vector shl and shr insn patterns can be defined with scalar
4079 operand 2 (shift operand). In this case, use constant or loop
4080 invariant op1 directly, without extending it to vector mode
4081 first. */
4082 optab_op2_mode = insn_data[icode].operand[2].mode;
4083 if (!VECTOR_MODE_P (optab_op2_mode))
4084 {
4085 if (vect_print_dump_info (REPORT_DETAILS))
4086 fprintf (vect_dump, "operand 1 using scalar mode.");
4087 vec_oprnd1 = op1;
4088 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4089 if (slp_node)
4090 {
4091 /* Store vec_oprnd1 for every vector stmt to be created
4092 for SLP_NODE. We check during the analysis that all the
4093 shift arguments are the same.
4094 TODO: Allow different constants for different vector
4095 stmts generated for an SLP instance. */
4096 for (k = 0; k < slp_node->vec_stmts_size - 1; k++)
4097 VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
4098 }
4099 }
4100 }
4101
4102 /* vec_oprnd1 is available if operand 1 should be of a scalar-type
4103 (a special case for certain kinds of vector shifts); otherwise,
4104 operand 1 should be of a vector type (the usual case). */
4105 if (op_type == binary_op && !vec_oprnd1)
4106 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4107 slp_node);
4108 else
4109 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL,
4110 slp_node);
4111 }
4112 else
4113 vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
4114
4115 /* Arguments are ready. Create the new vector stmt. */
4116 for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
4117 {
4118 if (op_type == binary_op)
4119 {
4120 vop1 = VEC_index (tree, vec_oprnds1, i);
4121 new_stmt = build_gimple_modify_stmt (vec_dest,
4122 build2 (code, vectype, vop0, vop1));
4123 }
4124 else
4125 new_stmt = build_gimple_modify_stmt (vec_dest,
4126 build1 (code, vectype, vop0));
4127
4128 new_temp = make_ssa_name (vec_dest, new_stmt);
4129 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4130 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4131 if (slp_node)
4132 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4133 }
4134
4135 if (j == 0)
4136 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4137 else
4138 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4139 prev_stmt_info = vinfo_for_stmt (new_stmt);
4140 }
4141
4142 VEC_free (tree, heap, vec_oprnds0);
4143 if (vec_oprnds1)
4144 VEC_free (tree, heap, vec_oprnds1);
4145
4146 return true;
4147 }
4148
4149
4150 /* Function vectorizable_type_demotion
4151
4152 Check if STMT performs a binary or unary operation that involves
4153 type demotion, and if it can be vectorized.
4154 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4155 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4156 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4157
4158 bool
4159 vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
4160 tree *vec_stmt)
4161 {
4162 tree vec_dest;
4163 tree scalar_dest;
4164 tree operation;
4165 tree op0;
4166 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4167 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4168 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4169 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4170 enum tree_code code, code1 = ERROR_MARK;
4171 tree new_temp;
4172 tree def, def_stmt;
4173 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4174 tree new_stmt;
4175 stmt_vec_info prev_stmt_info;
4176 int nunits_in;
4177 int nunits_out;
4178 tree vectype_out;
4179 int ncopies;
4180 int j;
4181 tree expr;
4182 tree vectype_in;
4183
4184 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4185 return false;
4186
4187 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4188 return false;
4189
4190 /* Is STMT a vectorizable type-demotion operation? */
4191 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4192 return false;
4193
4194 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4195 return false;
4196
4197 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4198 code = TREE_CODE (operation);
4199 if (code != NOP_EXPR && code != CONVERT_EXPR)
4200 return false;
4201
4202 op0 = TREE_OPERAND (operation, 0);
4203 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4204 if (!vectype_in)
4205 return false;
4206 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4207
4208 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4209 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4210 if (!vectype_out)
4211 return false;
4212 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4213 if (nunits_in != nunits_out / 2) /* FORNOW */
4214 return false;
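  /* Example (with 128-bit vectors): a demotion from int to short gives V4SI
     operands (nunits_in == 4) and V8HI results (nunits_out == 8), so two
     input vectors are combined into each output vector.  */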
4215
4216 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
4217 gcc_assert (ncopies >= 1);
4218 /* FORNOW. This restriction should be relaxed. */
4219 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4220 {
4221 if (vect_print_dump_info (REPORT_DETAILS))
4222 fprintf (vect_dump, "multiple types in nested loop.");
4223 return false;
4224 }
4225
4226 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4227 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4228 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4229 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4230 && (code == NOP_EXPR || code == CONVERT_EXPR))))
4231 return false;
4232
4233 /* Check the operands of the operation. */
4234 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4235 {
4236 if (vect_print_dump_info (REPORT_DETAILS))
4237 fprintf (vect_dump, "use not simple.");
4238 return false;
4239 }
4240
4241 /* Supportable by target? */
4242 if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1))
4243 return false;
4244
4245 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4246
4247 if (!vec_stmt) /* transformation not required. */
4248 {
4249 STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
4250 if (vect_print_dump_info (REPORT_DETAILS))
4251 fprintf (vect_dump, "=== vectorizable_demotion ===");
4252 vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
4253 return true;
4254 }
4255
4256 /** Transform. **/
4257 if (vect_print_dump_info (REPORT_DETAILS))
4258 fprintf (vect_dump, "transform type demotion operation. ncopies = %d.",
4259 ncopies);
4260
4261 /* Handle def. */
4262 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4263
4264 /* In case the vectorization factor (VF) is bigger than the number
4265 of elements that we can fit in a vectype (nunits), we have to generate
4266 more than one vector stmt, i.e., we need to "unroll" the
4267 vector stmt by a factor VF/nunits. */
4268 prev_stmt_info = NULL;
4269 for (j = 0; j < ncopies; j++)
4270 {
4271 /* Handle uses. */
4272 if (j == 0)
4273 {
4274 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4275 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4276 }
4277 else
4278 {
4279 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
4280 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4281 }
4282
4283 /* Arguments are ready. Create the new vector stmt. */
4284 expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1);
4285 new_stmt = build_gimple_modify_stmt (vec_dest, expr);
4286 new_temp = make_ssa_name (vec_dest, new_stmt);
4287 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
4288 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4289
4290 if (j == 0)
4291 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4292 else
4293 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4294
4295 prev_stmt_info = vinfo_for_stmt (new_stmt);
4296 }
4297
4298 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4299 return true;
4300 }
4301
4302
4303 /* Function vectorizable_type_promotion
4304
4305 Check if STMT performs a binary or unary operation that involves
4306 type promotion, and if it can be vectorized.
4307 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4308 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4309 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4310
4311 bool
4312 vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
4313 tree *vec_stmt)
4314 {
4315 tree vec_dest;
4316 tree scalar_dest;
4317 tree operation;
4318 tree op0, op1 = NULL;
4319 tree vec_oprnd0=NULL, vec_oprnd1=NULL;
4320 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4321 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4322 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4323 enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK;
4324 tree decl1 = NULL_TREE, decl2 = NULL_TREE;
4325 int op_type;
4326 tree def, def_stmt;
4327 enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
4328 tree new_stmt;
4329 stmt_vec_info prev_stmt_info;
4330 int nunits_in;
4331 int nunits_out;
4332 tree vectype_out;
4333 int ncopies;
4334 int j;
4335 tree vectype_in;
4336
4337 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4338 return false;
4339
4340 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4341 return false;
4342
4343 /* Is STMT a vectorizable type-promotion operation? */
4344 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4345 return false;
4346
4347 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
4348 return false;
4349
4350 operation = GIMPLE_STMT_OPERAND (stmt, 1);
4351 code = TREE_CODE (operation);
4352 if (code != NOP_EXPR && code != CONVERT_EXPR
4353 && code != WIDEN_MULT_EXPR)
4354 return false;
4355
4356 op0 = TREE_OPERAND (operation, 0);
4357 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0));
4358 if (!vectype_in)
4359 return false;
4360 nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
4361
4362 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4363 vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest));
4364 if (!vectype_out)
4365 return false;
4366 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
4367 if (nunits_out != nunits_in / 2) /* FORNOW */
4368 return false;
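  /* Example (with 128-bit vectors): a promotion from short to int gives V8HI
     operands (nunits_in == 8) and V4SI results (nunits_out == 4), so each
     input vector expands into two output vectors, generated below as two
     "halves".  */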
4369
4370 ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
4371 gcc_assert (ncopies >= 1);
4372 /* FORNOW. This restriction should be relaxed. */
4373 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4374 {
4375 if (vect_print_dump_info (REPORT_DETAILS))
4376 fprintf (vect_dump, "multiple types in nested loop.");
4377 return false;
4378 }
4379
4380 if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
4381 && INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4382 || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest))
4383 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0))
4384 && (code == CONVERT_EXPR || code == NOP_EXPR))))
4385 return false;
4386
4387 /* Check the operands of the operation. */
4388 if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
4389 {
4390 if (vect_print_dump_info (REPORT_DETAILS))
4391 fprintf (vect_dump, "use not simple.");
4392 return false;
4393 }
4394
4395 op_type = TREE_CODE_LENGTH (code);
4396 if (op_type == binary_op)
4397 {
4398 op1 = TREE_OPERAND (operation, 1);
4399 if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1]))
4400 {
4401 if (vect_print_dump_info (REPORT_DETAILS))
4402 fprintf (vect_dump, "use not simple.");
4403 return false;
4404 }
4405 }
4406
4407 /* Supportable by target? */
4408 if (!supportable_widening_operation (code, stmt, vectype_in,
4409 &decl1, &decl2, &code1, &code2))
4410 return false;
4411
4412 STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
4413
4414 if (!vec_stmt) /* transformation not required. */
4415 {
4416 STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
4417 if (vect_print_dump_info (REPORT_DETAILS))
4418 fprintf (vect_dump, "=== vectorizable_promotion ===");
4419 vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
4420 return true;
4421 }
4422
4423 /** Transform. **/
4424
4425 if (vect_print_dump_info (REPORT_DETAILS))
4426 fprintf (vect_dump, "transform type promotion operation. ncopies = %d.",
4427 ncopies);
4428
4429 /* Handle def. */
4430 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4431
4432 /* In case the vectorization factor (VF) is bigger than the number
4433 of elements that we can fit in a vectype (nunits), we have to generate
4434 more than one vector stmt, i.e., we need to "unroll" the
4435 vector stmt by a factor VF/nunits. */
4436
4437 prev_stmt_info = NULL;
4438 for (j = 0; j < ncopies; j++)
4439 {
4440 /* Handle uses. */
4441 if (j == 0)
4442 {
4443 vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
4444 if (op_type == binary_op)
4445 vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
4446 }
4447 else
4448 {
4449 vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
4450 if (op_type == binary_op)
4451 vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
4452 }
4453
4454 /* Arguments are ready. Create the new vector stmt. We are creating
4455 two vector defs because the widened result does not fit in one vector.
4456 The vectorized stmt can be expressed as a call to a target builtin,
4457 or by using a tree-code. */
4458 /* Generate first half of the widened result: */
4459 new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1,
4460 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4461 if (j == 0)
4462 STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
4463 else
4464 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4465 prev_stmt_info = vinfo_for_stmt (new_stmt);
4466
4467 /* Generate second half of the widened result: */
4468 new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2,
4469 vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt);
4470 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4471 prev_stmt_info = vinfo_for_stmt (new_stmt);
4472
4473 }
4474
4475 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4476 return true;
4477 }
4478
4479
4480 /* Function vect_strided_store_supported.
4481
4482 Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4483 and FALSE otherwise. */
4484
4485 static bool
4486 vect_strided_store_supported (tree vectype)
4487 {
4488 optab interleave_high_optab, interleave_low_optab;
4489 int mode;
4490
4491 mode = (int) TYPE_MODE (vectype);
4492
4493 /* Check that the operation is supported. */
4494 interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
4495 vectype);
4496 interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
4497 vectype);
4498 if (!interleave_high_optab || !interleave_low_optab)
4499 {
4500 if (vect_print_dump_info (REPORT_DETAILS))
4501 fprintf (vect_dump, "no optab for interleave.");
4502 return false;
4503 }
4504
4505 if (optab_handler (interleave_high_optab, mode)->insn_code
4506 == CODE_FOR_nothing
4507 || optab_handler (interleave_low_optab, mode)->insn_code
4508 == CODE_FOR_nothing)
4509 {
4510 if (vect_print_dump_info (REPORT_DETAILS))
4511 fprintf (vect_dump, "interleave op not supported by target.");
4512 return false;
4513 }
4514
4515 return true;
4516 }
4517
4518
4519 /* Function vect_permute_store_chain.
4520
4521 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4522 a power of 2, generate interleave_high/low stmts to reorder the data
4523 correctly for the stores. Return the final references for stores in
4524 RESULT_CHAIN.
4525
4526 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4527 The input is 4 vectors each containing 8 elements. We assign a number to each
4528 element; the input sequence is:
4529
4530 1st vec: 0 1 2 3 4 5 6 7
4531 2nd vec: 8 9 10 11 12 13 14 15
4532 3rd vec: 16 17 18 19 20 21 22 23
4533 4th vec: 24 25 26 27 28 29 30 31
4534
4535 The output sequence should be:
4536
4537 1st vec: 0 8 16 24 1 9 17 25
4538 2nd vec: 2 10 18 26 3 11 19 27
4539 3rd vec: 4 12 20 28 5 13 21 29
4540 4th vec: 6 14 22 30 7 15 23 31
4541
4542 i.e., we interleave the contents of the four vectors in their order.
4543
4544 We use interleave_high/low instructions to create such output. The input of
4545 each interleave_high/low operation is two vectors:
4546 1st vec 2nd vec
4547 0 1 2 3 4 5 6 7
4548 the even elements of the result vector are obtained left-to-right from the
4549 high/low elements of the first vector. The odd elements of the result are
4550 obtained left-to-right from the high/low elements of the second vector.
4551 The output of interleave_high will be: 0 4 1 5
4552 and of interleave_low: 2 6 3 7
4553
4554
4555 The permutation is done in log LENGTH stages. In each stage interleave_high
4556 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4557 where the first argument is taken from the first half of DR_CHAIN and the
4558 second argument from its second half.
4559 In our example,
4560
4561 I1: interleave_high (1st vec, 3rd vec)
4562 I2: interleave_low (1st vec, 3rd vec)
4563 I3: interleave_high (2nd vec, 4th vec)
4564 I4: interleave_low (2nd vec, 4th vec)
4565
4566 The output for the first stage is:
4567
4568 I1: 0 16 1 17 2 18 3 19
4569 I2: 4 20 5 21 6 22 7 23
4570 I3: 8 24 9 25 10 26 11 27
4571 I4: 12 28 13 29 14 30 15 31
4572
4573 The output of the second stage, i.e. the final result, is:
4574
4575 I1: 0 8 16 24 1 9 17 25
4576 I2: 2 10 18 26 3 11 19 27
4577 I3: 4 12 20 28 5 13 21 29
4578 I4: 6 14 22 30 7 15 23 31. */
4579
4580 static bool
4581 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4582 unsigned int length,
4583 tree stmt,
4584 block_stmt_iterator *bsi,
4585 VEC(tree,heap) **result_chain)
4586 {
4587 tree perm_dest, perm_stmt, vect1, vect2, high, low;
4588 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4589 tree scalar_dest, tmp;
4590 int i;
4591 unsigned int j;
4592 VEC(tree,heap) *first, *second;
4593
4594 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4595 first = VEC_alloc (tree, heap, length/2);
4596 second = VEC_alloc (tree, heap, length/2);
4597
4598 /* Check that the operation is supported. */
4599 if (!vect_strided_store_supported (vectype))
4600 return false;
4601
4602 *result_chain = VEC_copy (tree, heap, dr_chain);
4603
4604 for (i = 0; i < exact_log2 (length); i++)
4605 {
4606 for (j = 0; j < length/2; j++)
4607 {
4608 vect1 = VEC_index (tree, dr_chain, j);
4609 vect2 = VEC_index (tree, dr_chain, j+length/2);
4610
4611 /* Create interleaving stmt:
4612 in the case of big endian:
4613 high = interleave_high (vect1, vect2)
4614 and in the case of little endian:
4615 high = interleave_low (vect1, vect2). */
4616 perm_dest = create_tmp_var (vectype, "vect_inter_high");
4617 DECL_GIMPLE_REG_P (perm_dest) = 1;
4618 add_referenced_var (perm_dest);
4619 if (BYTES_BIG_ENDIAN)
4620 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4621 else
4622 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4623 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4624 high = make_ssa_name (perm_dest, perm_stmt);
4625 GIMPLE_STMT_OPERAND (perm_stmt, 0) = high;
4626 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4627 VEC_replace (tree, *result_chain, 2*j, high);
4628
4629 /* Create interleaving stmt:
4630 in the case of big endian:
4631 low = interleave_low (vect1, vect2)
4632 and in the case of little endian:
4633 low = interleave_high (vect1, vect2). */
4634 perm_dest = create_tmp_var (vectype, "vect_inter_low");
4635 DECL_GIMPLE_REG_P (perm_dest) = 1;
4636 add_referenced_var (perm_dest);
4637 if (BYTES_BIG_ENDIAN)
4638 tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2);
4639 else
4640 tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2);
4641 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
4642 low = make_ssa_name (perm_dest, perm_stmt);
4643 GIMPLE_STMT_OPERAND (perm_stmt, 0) = low;
4644 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
4645 VEC_replace (tree, *result_chain, 2*j+1, low);
4646 }
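      /* The output of this permutation stage becomes the input of the next
         stage.  */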
4647 dr_chain = VEC_copy (tree, heap, *result_chain);
4648 }
4649 return true;
4650 }
4651
4652
4653 /* Function vectorizable_store.
4654
4655 Check if STMT defines a non-scalar data-ref (array/pointer/structure) that
4656 can be vectorized.
4657 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4658 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
4659 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4660
4661 bool
4662 vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
4663 slp_tree slp_node)
4664 {
4665 tree scalar_dest;
4666 tree data_ref;
4667 tree op;
4668 tree vec_oprnd = NULL_TREE;
4669 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4670 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL;
4671 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4672 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4674 enum machine_mode vec_mode;
4675 tree dummy;
4676 enum dr_alignment_support alignment_support_scheme;
4677 tree def, def_stmt;
4678 enum vect_def_type dt;
4679 stmt_vec_info prev_stmt_info = NULL;
4680 tree dataref_ptr = NULL_TREE;
4681 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4682 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4683 int j;
4684 tree next_stmt, first_stmt = NULL_TREE;
4685 bool strided_store = false;
4686 unsigned int group_size, i;
4687 VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
4688 bool inv_p;
4689 VEC(tree,heap) *vec_oprnds = NULL;
4690 bool slp = (slp_node != NULL);
4691 stmt_vec_info first_stmt_vinfo;
4692 unsigned int vec_num;
4693
4694 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
4695 this, so we can safely override NCOPIES with 1 here. */
4696 if (slp)
4697 ncopies = 1;
4698
4699 gcc_assert (ncopies >= 1);
4700
4701 /* FORNOW. This restriction should be relaxed. */
4702 if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
4703 {
4704 if (vect_print_dump_info (REPORT_DETAILS))
4705 fprintf (vect_dump, "multiple types in nested loop.");
4706 return false;
4707 }
4708
4709 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4710 return false;
4711
4712 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
4713 return false;
4714
4715 /* Is vectorizable store? */
4716
4717 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
4718 return false;
4719
4720 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
4721 if (TREE_CODE (scalar_dest) != ARRAY_REF
4722 && TREE_CODE (scalar_dest) != INDIRECT_REF
4723 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
4724 return false;
4725
4726 op = GIMPLE_STMT_OPERAND (stmt, 1);
4727 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4728 {
4729 if (vect_print_dump_info (REPORT_DETAILS))
4730 fprintf (vect_dump, "use not simple.");
4731 return false;
4732 }
4733
4734 vec_mode = TYPE_MODE (vectype);
4735 /* FORNOW. In some cases we can vectorize even if the data-type is not
4736 supported (e.g., array initialization with 0). */
4737 if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing)
4738 return false;
4739
4740 if (!STMT_VINFO_DATA_REF (stmt_info))
4741 return false;
4742
4743 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4744 {
4745 strided_store = true;
4746 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
4747 if (!vect_strided_store_supported (vectype)
4748 && !PURE_SLP_STMT (stmt_info) && !slp)
4749 return false;
4750
4751 if (first_stmt == stmt)
4752 {
4753 /* STMT is the leader of the group. Check the operands of all the
4754 stmts of the group. */
4755 next_stmt = DR_GROUP_NEXT_DR (stmt_info);
4756 while (next_stmt)
4757 {
4758 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4759 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
4760 {
4761 if (vect_print_dump_info (REPORT_DETAILS))
4762 fprintf (vect_dump, "use not simple.");
4763 return false;
4764 }
4765 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4766 }
4767 }
4768 }
4769
4770 if (!vec_stmt) /* transformation not required. */
4771 {
4772 STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
4773 if (!PURE_SLP_STMT (stmt_info))
4774 vect_model_store_cost (stmt_info, ncopies, dt, NULL);
4775 return true;
4776 }
4777
4778 /** Transform. **/
4779
4780 if (strided_store)
4781 {
4782 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
4783 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
4784
4785 DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++;
4786
4787 /* FORNOW */
4788 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
4789
4790 /* We vectorize all the stmts of the interleaving group when we
4791 reach the last stmt in the group. */
4792 if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
4793 < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
4794 && !slp)
4795 {
4796 *vec_stmt = NULL_TREE;
4797 return true;
4798 }
4799
4800 if (slp)
4801 strided_store = false;
4802
4803 /* VEC_NUM is the number of vect stmts to be created for this group. */
4804 if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
4805 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4806 else
4807 vec_num = group_size;
4808 }
4809 else
4810 {
4811 first_stmt = stmt;
4812 first_dr = dr;
4813 group_size = vec_num = 1;
4814 first_stmt_vinfo = stmt_info;
4815 }
4816
4817 if (vect_print_dump_info (REPORT_DETAILS))
4818 fprintf (vect_dump, "transform store. ncopies = %d",ncopies);
4819
4820 dr_chain = VEC_alloc (tree, heap, group_size);
4821 oprnds = VEC_alloc (tree, heap, group_size);
4822
4823 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
4824 gcc_assert (alignment_support_scheme);
4825 gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */
4826
4827 /* In case the vectorization factor (VF) is bigger than the number
4828 of elements that we can fit in a vectype (nunits), we have to generate
4829 more than one vector stmt, i.e., we need to "unroll" the
4830 vector stmt by a factor VF/nunits. For more details see documentation in
4831 vect_get_vec_def_for_copy_stmt. */
4832
4833 /* In case of interleaving (non-unit strided access):
4834
4835 S1: &base + 2 = x2
4836 S2: &base = x0
4837 S3: &base + 1 = x1
4838 S4: &base + 3 = x3
4839
4840 We create vectorized stores starting from the base address (the access of
4841 the first stmt in the chain, S2 in the above example) when the last store
4842 stmt of the chain (S4) is reached:
4843
4844 VS1: &base = vx2
4845 VS2: &base + vec_size*1 = vx0
4846 VS3: &base + vec_size*2 = vx1
4847 VS4: &base + vec_size*3 = vx3
4848
4849 Then permutation statements are generated:
4850
4851 VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 >
4852 VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 >
4853 ...
4854
4855 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
4856 (the order of the data-refs in the output of vect_permute_store_chain
4857 corresponds to the order of scalar stmts in the interleaving chain - see
4858 the documentation of vect_permute_store_chain()).
4859
4860 In case of both multiple types and interleaving, above vector stores and
4861 permutation stmts are created for every copy. The result vector stmts are
4862 put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
4863 STMT_VINFO_RELATED_STMT for the next copies.
4864 */
4865
4866 prev_stmt_info = NULL;
4867 for (j = 0; j < ncopies; j++)
4868 {
4869 tree new_stmt;
4870 tree ptr_incr;
4871
4872 if (j == 0)
4873 {
4874 if (slp)
4875 {
4876 /* Get vectorized arguments for SLP_NODE. */
4877 vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
4878
4879 vec_oprnd = VEC_index (tree, vec_oprnds, 0);
4880 }
4881 else
4882 {
4883 /* For interleaved stores we collect vectorized defs for all the
4884 stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
4885 used as an input to vect_permute_store_chain(), and OPRNDS as
4886 an input to vect_get_vec_def_for_stmt_copy() for the next copy.
4887
4888 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4889 OPRNDS are of size 1. */
4890 next_stmt = first_stmt;
4891 for (i = 0; i < group_size; i++)
4892 {
4893 /* Since gaps are not supported for interleaved stores,
4894 GROUP_SIZE is the exact number of stmts in the chain.
4895 Therefore, NEXT_STMT can't be NULL_TREE. In case that
4896 there is no interleaving, GROUP_SIZE is 1, and only one
4897 iteration of the loop will be executed. */
4898 gcc_assert (next_stmt);
4899 op = GIMPLE_STMT_OPERAND (next_stmt, 1);
4900
4901 vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
4902 NULL);
4903 VEC_quick_push(tree, dr_chain, vec_oprnd);
4904 VEC_quick_push(tree, oprnds, vec_oprnd);
4905 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4906 }
4907 }
4908 dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
4909 &dummy, &ptr_incr, false,
4910 TREE_TYPE (vec_oprnd), &inv_p);
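	  /* INV_P would indicate a loop-invariant address; presumably such a
	     store would not have been marked vectorizable, so it is not
	     expected here.  */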
4911 gcc_assert (!inv_p);
4912 }
4913 else
4914 {
4915 /* FORNOW SLP doesn't work for multiple types. */
4916 gcc_assert (!slp);
4917
4918 /* For interleaved stores we created vectorized defs for all the
4919 defs stored in OPRNDS in the previous iteration (previous copy).
4920 DR_CHAIN is then used as an input to vect_permute_store_chain(),
4921 and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the
4922 next copy.
4923 If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
4924 OPRNDS are of size 1. */
4925 for (i = 0; i < group_size; i++)
4926 {
4927 op = VEC_index (tree, oprnds, i);
4928 vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
4929 vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op);
4930 VEC_replace(tree, dr_chain, i, vec_oprnd);
4931 VEC_replace(tree, oprnds, i, vec_oprnd);
4932 }
4933 dataref_ptr =
4934 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
4935 }
4936
4937 if (strided_store)
4938 {
4939 result_chain = VEC_alloc (tree, heap, group_size);
4940 /* Permute. */
4941 if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi,
4942 &result_chain))
4943 return false;
4944 }
4945
4946 next_stmt = first_stmt;
4947 for (i = 0; i < vec_num; i++)
4948 {
4949 if (i > 0)
4950 /* Bump the vector pointer. */
4951 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
4952 NULL_TREE);
4953
4954 if (slp)
4955 vec_oprnd = VEC_index (tree, vec_oprnds, i);
4956 else if (strided_store)
4957 /* For strided stores vectorized defs are interleaved in
4958 vect_permute_store_chain(). */
4959 vec_oprnd = VEC_index (tree, result_chain, i);
4960
4961 data_ref = build_fold_indirect_ref (dataref_ptr);
4962 /* Arguments are ready. Create the new vector stmt. */
4963 new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd);
4964 vect_finish_stmt_generation (stmt, new_stmt, bsi);
4965 mark_symbols_for_renaming (new_stmt);
4966
4967 if (j == 0)
4968 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4969 else
4970 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4971
4972 prev_stmt_info = vinfo_for_stmt (new_stmt);
4973 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
4974 if (!next_stmt)
4975 break;
4976 }
4977 }
4978
4979 return true;
4980 }
4981
4982
4983 /* Function vect_setup_realignment
4984
4985 This function is called when vectorizing an unaligned load using
4986 the dr_explicit_realign[_optimized] scheme.
4987 This function generates the following code at the loop prolog:
4988
4989 p = initial_addr;
4990 x msq_init = *(floor(p)); # prolog load
4991 realignment_token = call target_builtin;
4992 loop:
4993 x msq = phi (msq_init, ---)
4994
4995 The stmts marked with x are generated only for the case of
4996 dr_explicit_realign_optimized.
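(Here floor(p) denotes p rounded down to the nearest vector-size-aligned
address; such aligned loads are emitted below as ALIGN_INDIRECT_REFs.)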
4997
4998 The code above sets up a new (vector) pointer, pointing to the first
4999 location accessed by STMT, and a "floor-aligned" load using that pointer.
5000 It also generates code to compute the "realignment-token" (if the relevant
5001 target hook was defined), and creates a phi-node at the loop-header bb
5002 whose arguments are the result of the prolog-load (created by this
5003 function) and the result of a load that takes place in the loop (to be
5004 created by the caller to this function).
5005
5006 For the case of dr_explicit_realign_optimized:
5007 The caller to this function uses the phi-result (msq) to create the
5008 realignment code inside the loop, and sets up the missing phi argument,
5009 as follows:
5010 loop:
5011 msq = phi (msq_init, lsq)
5012 lsq = *(floor(p')); # load in loop
5013 result = realign_load (msq, lsq, realignment_token);
5014
5015 For the case of dr_explicit_realign:
5016 loop:
5017 msq = *(floor(p)); # load in loop
5018 p' = p + (VS-1);
5019 lsq = *(floor(p')); # load in loop
5020 result = realign_load (msq, lsq, realignment_token);
5021
5022 Input:
5023 STMT - (scalar) load stmt to be vectorized. This load accesses
5024 a memory location that may be unaligned.
5025 BSI - place where new code is to be inserted.
5026 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5027 is used.
5028
5029 Output:
5030 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5031 target hook, if defined.
5032 Return value - the result of the loop-header phi node. */
5033
5034 static tree
5035 vect_setup_realignment (tree stmt, block_stmt_iterator *bsi,
5036 tree *realignment_token,
5037 enum dr_alignment_support alignment_support_scheme,
5038 tree init_addr,
5039 struct loop **at_loop)
5040 {
5041 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5042 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5043 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5044 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5045 edge pe;
5046 tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5047 tree vec_dest;
5048 tree inc;
5049 tree ptr;
5050 tree data_ref;
5051 tree new_stmt;
5052 basic_block new_bb;
5053 tree msq_init = NULL_TREE;
5054 tree new_temp;
5055 tree phi_stmt;
5056 tree msq = NULL_TREE;
5057 tree stmts = NULL_TREE;
5058 bool inv_p;
5059 bool compute_in_loop = false;
5060 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5061 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5062 struct loop *loop_for_initial_load;
5063
5064 gcc_assert (alignment_support_scheme == dr_explicit_realign
5065 || alignment_support_scheme == dr_explicit_realign_optimized);
5066
5067 /* We need to generate three things:
5068 1. the misalignment computation
5069 2. the extra vector load (for the optimized realignment scheme).
5070 3. the phi node for the two vectors from which the realignment is
5071 done (for the optimized realignment scheme).
5072 */
5073
5074 /* 1. Determine where to generate the misalignment computation.
5075
5076 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5077 calculation will be generated by this function, outside the loop (in the
5078 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5079 caller, inside the loop.
5080
5081 Background: If the misalignment remains fixed throughout the iterations of
5082 the loop, then both realignment schemes are applicable, and also the
5083 misalignment computation can be done outside LOOP. This is because we are
5084 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5085 are a multiple of VS (the Vector Size), and therefore the misalignment in
5086 different vectorized LOOP iterations is always the same.
5087 The problem arises only if the memory access is in an inner-loop nested
5088 inside LOOP, which is now being vectorized using outer-loop vectorization.
5089 This is the only case when the misalignment of the memory access may not
5090 remain fixed throughout the iterations of the inner-loop (as explained in
5091 detail in vect_supportable_dr_alignment). In this case, not only is the
5092 optimized realignment scheme not applicable, but also the misalignment
5093 computation (and generation of the realignment token that is passed to
5094 REALIGN_LOAD) have to be done inside the loop.
5095
5096 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5097 or not, which in turn determines if the misalignment is computed inside
5098 the inner-loop, or outside LOOP. */
5099
5100 if (init_addr != NULL_TREE)
5101 {
5102 compute_in_loop = true;
5103 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5104 }
5105
5106
5107 /* 2. Determine where to generate the extra vector load.
5108
5109 For the optimized realignment scheme, instead of generating two vector
5110 loads in each iteration, we generate a single extra vector load in the
5111 preheader of the loop, and in each iteration reuse the result of the
5112 vector load from the previous iteration. In case the memory access is in
5113 an inner-loop nested inside LOOP, which is now being vectorized using
5114 outer-loop vectorization, we need to determine whether this initial vector
5115 load should be generated at the preheader of the inner-loop, or can be
5116 generated at the preheader of LOOP. If the memory access has no evolution
5117 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5118 to be generated inside LOOP (in the preheader of the inner-loop). */
5119
5120 if (nested_in_vect_loop)
5121 {
5122 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5123 bool invariant_in_outerloop =
5124 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5125 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5126 }
5127 else
5128 loop_for_initial_load = loop;
5129 if (at_loop)
5130 *at_loop = loop_for_initial_load;
5131
5132 /* 3. For the case of the optimized realignment, create the first vector
5133 load at the loop preheader. */
5134
5135 if (alignment_support_scheme == dr_explicit_realign_optimized)
5136 {
5137 /* Create msq_init = *(floor(p1)) in the loop preheader */
5138
5139 gcc_assert (!compute_in_loop);
5140 pe = loop_preheader_edge (loop_for_initial_load);
5141 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5142 ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
5143 &init_addr, &inc, true, NULL_TREE, &inv_p);
5144 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5145 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5146 new_temp = make_ssa_name (vec_dest, new_stmt);
5147 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5148 mark_symbols_for_renaming (new_stmt);
5149 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5150 gcc_assert (!new_bb);
5151 msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0);
5152 }
5153
5154 /* 4. Create realignment token using a target builtin, if available.
5155 It is done either inside the containing loop, or before LOOP (as
5156 determined above). */
5157
5158 if (targetm.vectorize.builtin_mask_for_load)
5159 {
5160 tree builtin_decl;
5161
5162 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5163 if (compute_in_loop)
5164 gcc_assert (init_addr); /* already computed by the caller. */
5165 else
5166 {
5167 /* Generate the INIT_ADDR computation outside LOOP. */
5168 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5169 NULL_TREE, loop);
5170 pe = loop_preheader_edge (loop);
5171 new_bb = bsi_insert_on_edge_immediate (pe, stmts);
5172 gcc_assert (!new_bb);
5173 }
5174
5175 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5176 new_stmt = build_call_expr (builtin_decl, 1, init_addr);
5177 vec_dest = vect_create_destination_var (scalar_dest,
5178 TREE_TYPE (new_stmt));
5179 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5180 new_temp = make_ssa_name (vec_dest, new_stmt);
5181 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5182
5183 if (compute_in_loop)
5184 bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
5185 else
5186 {
5187 /* Generate the misalignment computation outside LOOP. */
5188 pe = loop_preheader_edge (loop);
5189 new_bb = bsi_insert_on_edge_immediate (pe, new_stmt);
5190 gcc_assert (!new_bb);
5191 }
5192
5193 *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0);
5194
5195 /* The result of the CALL_EXPR to this builtin is determined from
5196 the value of the parameter and no global variables are touched
5197 which makes the builtin a "const" function. Requiring the
5198 builtin to have the "const" attribute makes it unnecessary
5199 to call mark_call_clobbered. */
5200 gcc_assert (TREE_READONLY (builtin_decl));
5201 }
5202
5203 if (alignment_support_scheme == dr_explicit_realign)
5204 return msq;
5205
5206 gcc_assert (!compute_in_loop);
5207 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5208
5209
5210 /* 5. Create msq = phi <msq_init, lsq> in loop */
5211
5212 pe = loop_preheader_edge (containing_loop);
5213 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5214 msq = make_ssa_name (vec_dest, NULL_TREE);
5215 phi_stmt = create_phi_node (msq, containing_loop->header);
5216 SSA_NAME_DEF_STMT (msq) = phi_stmt;
5217 add_phi_arg (phi_stmt, msq_init, pe);
5218
5219 return msq;
5220 }
5221
5222
5223 /* Function vect_strided_load_supported.
5224
5225 Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported,
5226 and FALSE otherwise. */
5227
5228 static bool
5229 vect_strided_load_supported (tree vectype)
5230 {
5231 optab perm_even_optab, perm_odd_optab;
5232 int mode;
5233
5234 mode = (int) TYPE_MODE (vectype);
5235
5236 perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
5237 if (!perm_even_optab)
5238 {
5239 if (vect_print_dump_info (REPORT_DETAILS))
5240 fprintf (vect_dump, "no optab for perm_even.");
5241 return false;
5242 }
5243
5244 if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing)
5245 {
5246 if (vect_print_dump_info (REPORT_DETAILS))
5247 fprintf (vect_dump, "perm_even op not supported by target.");
5248 return false;
5249 }
5250
5251 perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
5252 if (!perm_odd_optab)
5253 {
5254 if (vect_print_dump_info (REPORT_DETAILS))
5255 fprintf (vect_dump, "no optab for perm_odd.");
5256 return false;
5257 }
5258
5259 if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing)
5260 {
5261 if (vect_print_dump_info (REPORT_DETAILS))
5262 fprintf (vect_dump, "perm_odd op not supported by target.");
5263 return false;
5264 }
5265 return true;
5266 }
5267
5268
5269 /* Function vect_permute_load_chain.
5270
5271 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5272 a power of 2, generate extract_even/odd stmts to reorder the input data
5273 correctly. Return the final references for loads in RESULT_CHAIN.
5274
5275 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5276 The input is 4 vectors each containing 8 elements. We assign a number to each
5277 element, the input sequence is:
5278
5279 1st vec: 0 1 2 3 4 5 6 7
5280 2nd vec: 8 9 10 11 12 13 14 15
5281 3rd vec: 16 17 18 19 20 21 22 23
5282 4th vec: 24 25 26 27 28 29 30 31
5283
5284 The output sequence should be:
5285
5286 1st vec: 0 4 8 12 16 20 24 28
5287 2nd vec: 1 5 9 13 17 21 25 29
5288 3rd vec: 2 6 10 14 18 22 26 30
5289 4th vec: 3 7 11 15 19 23 27 31
5290
5291 i.e., the first output vector should contain the first elements of each
5292 interleaving group, etc.
5293
5294 We use extract_even/odd instructions to create such output. The input of each
5295 extract_even/odd operation is two vectors
5296 1st vec: 0 1 2 3
5297 2nd vec: 4 5 6 7
5298
5299 and the output is the vector of extracted even/odd elements. The output of
5300 extract_even will be: 0 2 4 6
5301 and of extract_odd: 1 3 5 7
5302
5303
5304 The permutation is done in log2(LENGTH) stages. In each stage extract_even and
5305 extract_odd stmts are created for each pair of vectors in DR_CHAIN in their
5306 order. In our example,
5307
5308 E1: extract_even (1st vec, 2nd vec)
5309 E2: extract_odd (1st vec, 2nd vec)
5310 E3: extract_even (3rd vec, 4th vec)
5311 E4: extract_odd (3rd vec, 4th vec)
5312
5313 The output for the first stage will be:
5314
5315 E1: 0 2 4 6 8 10 12 14
5316 E2: 1 3 5 7 9 11 13 15
5317 E3: 16 18 20 22 24 26 28 30
5318 E4: 17 19 21 23 25 27 29 31
5319
5320 In order to proceed and create the correct sequence for the next stage (or
5321 for the correct output, if the second stage is the last one, as in our
5322 example), we first put the output of extract_even operation and then the
5323 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5324 The input for the second stage is:
5325
5326 1st vec (E1): 0 2 4 6 8 10 12 14
5327 2nd vec (E3): 16 18 20 22 24 26 28 30
5328 3rd vec (E2): 1 3 5 7 9 11 13 15
5329 4th vec (E4): 17 19 21 23 25 27 29 31
5330
5331 The output of the second stage:
5332
5333 E1: 0 4 8 12 16 20 24 28
5334 E2: 2 6 10 14 18 22 26 30
5335 E3: 1 5 9 13 17 21 25 29
5336 E4: 3 7 11 15 19 23 27 31
5337
5338 And RESULT_CHAIN after reordering:
5339
5340 1st vec (E1): 0 4 8 12 16 20 24 28
5341 2nd vec (E3): 1 5 9 13 17 21 25 29
5342 3rd vec (E2): 2 6 10 14 18 22 26 30
5343 4th vec (E4): 3 7 11 15 19 23 27 31. */
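/* For illustration of the index arithmetic used below (with LENGTH = 4 as in
   the example above): in the first stage, j = 0 stores E1 at RESULT_CHAIN
   index 0 (j/2) and E2 at index 2 (j/2 + length/2), while j = 2 stores E3 at
   index 1 and E4 at index 3, producing exactly the E1, E3, E2, E4 order that
   feeds the second stage.  */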
5344
5345 static bool
5346 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
5347 unsigned int length,
5348 tree stmt,
5349 block_stmt_iterator *bsi,
5350 VEC(tree,heap) **result_chain)
5351 {
5352 tree perm_dest, perm_stmt, data_ref, first_vect, second_vect;
5353 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5354 tree tmp;
5355 int i;
5356 unsigned int j;
5357
5358 /* Check that the operation is supported. */
5359 if (!vect_strided_load_supported (vectype))
5360 return false;
5361
5362 *result_chain = VEC_copy (tree, heap, dr_chain);
5363 for (i = 0; i < exact_log2 (length); i++)
5364 {
5365 for (j = 0; j < length; j +=2)
5366 {
5367 first_vect = VEC_index (tree, dr_chain, j);
5368 second_vect = VEC_index (tree, dr_chain, j+1);
5369
5370 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5371 perm_dest = create_tmp_var (vectype, "vect_perm_even");
5372 DECL_GIMPLE_REG_P (perm_dest) = 1;
5373 add_referenced_var (perm_dest);
5374
5375 tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype,
5376 first_vect, second_vect);
5377 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5378
5379 data_ref = make_ssa_name (perm_dest, perm_stmt);
5380 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5381 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5382 mark_symbols_for_renaming (perm_stmt);
5383
5384 VEC_replace (tree, *result_chain, j/2, data_ref);
5385
5386 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5387 perm_dest = create_tmp_var (vectype, "vect_perm_odd");
5388 DECL_GIMPLE_REG_P (perm_dest) = 1;
5389 add_referenced_var (perm_dest);
5390
5391 tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype,
5392 first_vect, second_vect);
5393 perm_stmt = build_gimple_modify_stmt (perm_dest, tmp);
5394 data_ref = make_ssa_name (perm_dest, perm_stmt);
5395 GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref;
5396 vect_finish_stmt_generation (stmt, perm_stmt, bsi);
5397 mark_symbols_for_renaming (perm_stmt);
5398
5399 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
5400 }
5401 dr_chain = VEC_copy (tree, heap, *result_chain);
5402 }
5403 return true;
5404 }
5405
5406
5407 /* Function vect_transform_strided_load.
5408
5409 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5410 to perform their permutation and record the resulting vectorized statements
5411 in the stmt_info of the corresponding scalar statements.
5412 */
5413
5414 static bool
5415 vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
5416 block_stmt_iterator *bsi)
5417 {
5418 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5419 tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5420 tree next_stmt, new_stmt;
5421 VEC(tree,heap) *result_chain = NULL;
5422 unsigned int i, gap_count;
5423 tree tmp_data_ref;
5424
5425 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5426 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5427 permuted vectors that are ready for vector computation. */
5428 result_chain = VEC_alloc (tree, heap, size);
5429 /* Permute. */
5430 if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain))
5431 return false;
5432
5433 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5434 Since we scan the chain starting from its first node, their order
5435 corresponds to the order of data-refs in RESULT_CHAIN. */
5436 next_stmt = first_stmt;
5437 gap_count = 1;
5438 for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++)
5439 {
5440 if (!next_stmt)
5441 break;
5442
5443 /* Skip the gaps. Loads created for the gaps will be removed later by a
5444 dead code elimination pass.
5445 DR_GROUP_GAP is the number of steps in elements from the previous
5446 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
5447 correspond to the gaps.
5448 */
5449 if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
5450 {
5451 gap_count++;
5452 continue;
5453 }
5454
5455 while (next_stmt)
5456 {
5457 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5458 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5459 copies, and we put the new vector statement in the first available
5460 RELATED_STMT. */
5461 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5462 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5463 else
5464 {
5465 tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5466 tree rel_stmt = STMT_VINFO_RELATED_STMT (
5467 vinfo_for_stmt (prev_stmt));
5468 while (rel_stmt)
5469 {
5470 prev_stmt = rel_stmt;
5471 rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5472 }
5473 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt;
5474 }
5475 next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
5476 gap_count = 1;
5477 /* If NEXT_STMT accesses the same DR as the previous statement,
5478 put the same TMP_DATA_REF as its vectorized statement; otherwise
5479 get the next data-ref from RESULT_CHAIN. */
5480 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5481 break;
5482 }
5483 }
5484 return true;
5485 }
5486
5487
5488 /* vectorizable_load.
5489
5490 Check if STMT reads a non-scalar data-ref (array/pointer/structure) that
5491 can be vectorized.
5492 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5493 stmt to replace it, put it in VEC_STMT, and insert it at BSI.
5494 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5495
5496 bool
5497 vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
5498 slp_tree slp_node)
5499 {
5500 tree scalar_dest;
5501 tree vec_dest = NULL;
5502 tree data_ref = NULL;
5503 tree op;
5504 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5505 stmt_vec_info prev_stmt_info;
5506 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5507 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5508 struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father;
5509 bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5510 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
5511 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5512 tree new_temp;
5513 int mode;
5514 tree new_stmt = NULL_TREE;
5515 tree dummy;
5516 enum dr_alignment_support alignment_support_scheme;
5517 tree dataref_ptr = NULL_TREE;
5518 tree ptr_incr;
5519 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5520 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5521 int i, j, group_size;
5522 tree msq = NULL_TREE, lsq;
5523 tree offset = NULL_TREE;
5524 tree realignment_token = NULL_TREE;
5525 tree phi = NULL_TREE;
5526 VEC(tree,heap) *dr_chain = NULL;
5527 bool strided_load = false;
5528 tree first_stmt;
5529 tree scalar_type;
5530 bool inv_p;
5531 bool compute_in_loop = false;
5532 struct loop *at_loop;
5533 int vec_num;
5534 bool slp = (slp_node != NULL);
5535
5536 /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
5537 this, so we can safely override NCOPIES with 1 here. */
5538 if (slp)
5539 ncopies = 1;
5540
5541 gcc_assert (ncopies >= 1);
5542
5543 /* FORNOW. This restriction should be relaxed. */
5544 if (nested_in_vect_loop && ncopies > 1)
5545 {
5546 if (vect_print_dump_info (REPORT_DETAILS))
5547 fprintf (vect_dump, "multiple types in nested loop.");
5548 return false;
5549 }
5550
5551 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5552 return false;
5553
5554 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
5555 return false;
5556
5557 /* Is vectorizable load? */
5558 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5559 return false;
5560
5561 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
5562 if (TREE_CODE (scalar_dest) != SSA_NAME)
5563 return false;
5564
5565 op = GIMPLE_STMT_OPERAND (stmt, 1);
5566 if (TREE_CODE (op) != ARRAY_REF
5567 && TREE_CODE (op) != INDIRECT_REF
5568 && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
5569 return false;
5570
5571 if (!STMT_VINFO_DATA_REF (stmt_info))
5572 return false;
5573
5574 scalar_type = TREE_TYPE (DR_REF (dr));
5575 mode = (int) TYPE_MODE (vectype);
5576
5577 /* FORNOW. In some cases we can vectorize even if the data-type is not
5578 supported (e.g. data copies). */
5579 if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing)
5580 {
5581 if (vect_print_dump_info (REPORT_DETAILS))
5582 fprintf (vect_dump, "Aligned load, but unsupported type.");
5583 return false;
5584 }
5585
5586 /* Check if the load is a part of an interleaving chain. */
5587 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5588 {
5589 strided_load = true;
5590 /* FORNOW */
5591 gcc_assert (! nested_in_vect_loop);
5592
5593 /* Check if interleaving is supported. */
5594 if (!vect_strided_load_supported (vectype)
5595 && !PURE_SLP_STMT (stmt_info) && !slp)
5596 return false;
5597 }
5598
5599 if (!vec_stmt) /* transformation not required. */
5600 {
5601 STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
5602 vect_model_load_cost (stmt_info, ncopies, NULL);
5603 return true;
5604 }
5605
5606 if (vect_print_dump_info (REPORT_DETAILS))
5607 fprintf (vect_dump, "transform load.");
5608
5609 /** Transform. **/
5610
5611 if (strided_load)
5612 {
5613 first_stmt = DR_GROUP_FIRST_DR (stmt_info);
5614 /* Check if the chain of loads is already vectorized. */
5615 if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt)))
5616 {
5617 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5618 return true;
5619 }
5620 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
5621 group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
5622 dr_chain = VEC_alloc (tree, heap, group_size);
5623
5624 /* VEC_NUM is the number of vect stmts to be created for this group. */
5625 if (slp)
5626 {
5627 strided_load = false;
5628 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5629 }
5630 else
5631 vec_num = group_size;
5632 }
5633 else
5634 {
5635 first_stmt = stmt;
5636 first_dr = dr;
5637 group_size = vec_num = 1;
5638 }
5639
5640 alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
5641 gcc_assert (alignment_support_scheme);
5642
5643 /* In case the vectorization factor (VF) is bigger than the number
5644 of elements that we can fit in a vectype (nunits), we have to generate
5645 more than one vector stmt - i.e - we need to "unroll" the
5646 vector stmt by a factor VF/nunits. In doing so, we record a pointer
5647 from one copy of the vector stmt to the next, in the field
5648 STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
5649 stages to find the correct vector defs to be used when vectorizing
5650 stmts that use the defs of the current stmt. The example below illustrates
5651 the vectorization process when VF=16 and nunits=4 (i.e - we need to create
5652 4 vectorized stmts):
5653
5654 before vectorization:
5655 RELATED_STMT VEC_STMT
5656 S1: x = memref - -
5657 S2: z = x + 1 - -
5658
5659 step 1: vectorize stmt S1:
5660 We first create the vector stmt VS1_0, and, as usual, record a
5661 pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
5662 Next, we create the vector stmt VS1_1, and record a pointer to
5663 it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
5664 Similarly, for VS1_2 and VS1_3. This is the resulting chain of
5665 stmts and pointers:
5666 RELATED_STMT VEC_STMT
5667 VS1_0: vx0 = memref0 VS1_1 -
5668 VS1_1: vx1 = memref1 VS1_2 -
5669 VS1_2: vx2 = memref2 VS1_3 -
5670 VS1_3: vx3 = memref3 - -
5671 S1: x = load - VS1_0
5672 S2: z = x + 1 - -
5673
5674 See in documentation in vect_get_vec_def_for_stmt_copy for how the
5675 information we recorded in RELATED_STMT field is used to vectorize
5676 stmt S2. */
5677
5678 /* In case of interleaving (non-unit strided access):
5679
5680 S1: x2 = &base + 2
5681 S2: x0 = &base
5682 S3: x1 = &base + 1
5683 S4: x3 = &base + 3
5684
5685 Vectorized loads are created in the order of memory accesses
5686 starting from the access of the first stmt of the chain:
5687
5688 VS1: vx0 = &base
5689 VS2: vx1 = &base + vec_size*1
5690 VS3: vx2 = &base + vec_size*2
5691 VS4: vx3 = &base + vec_size*3
5692
5693 Then permutation statements are generated:
5694
5695 VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 >
5696 VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 >
5697 ...
5698
5699 And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts
5700 (the order of the data-refs in the output of vect_permute_load_chain
5701 corresponds to the order of scalar stmts in the interleaving chain - see
5702 the documentation of vect_permute_load_chain()).
5703 The generation of permutation stmts and recording them in
5704 STMT_VINFO_VEC_STMT is done in vect_transform_strided_load().
5705
5706 In case of both multiple types and interleaving, the vector loads and
5707 permutation stmts above are created for every copy. The result vector stmts
5708 are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding
5709 STMT_VINFO_RELATED_STMT for the next copies. */
5710
5711 /* If the data reference is aligned (dr_aligned) or potentially unaligned
5712 on a target that supports unaligned accesses (dr_unaligned_supported)
5713 we generate the following code:
5714 p = initial_addr;
5715 indx = 0;
5716 loop {
5717 p = p + indx * vectype_size;
5718 vec_dest = *(p);
5719 indx = indx + 1;
5720 }
5721
5722 Otherwise, the data reference is potentially unaligned on a target that
5723 does not support unaligned accesses (dr_explicit_realign_optimized) -
5724 then generate the following code, in which the data in each iteration is
5725 obtained by two vector loads, one from the previous iteration, and one
5726 from the current iteration:
5727 p1 = initial_addr;
5728 msq_init = *(floor(p1))
5729 p2 = initial_addr + VS - 1;
5730 realignment_token = call target_builtin;
5731 indx = 0;
5732 loop {
5733 p2 = p2 + indx * vectype_size
5734 lsq = *(floor(p2))
5735 vec_dest = realign_load (msq, lsq, realignment_token)
5736 indx = indx + 1;
5737 msq = lsq;
5738 } */
5739
5740 /* If the misalignment remains the same throughout the execution of the
5741 loop, we can create the init_addr and permutation mask at the loop
5742 preheader. Otherwise, it needs to be created inside the loop.
5743 This can only occur when vectorizing memory accesses in the inner-loop
5744 nested within an outer-loop that is being vectorized. */
5745
5746 if (nested_in_vect_loop_p (loop, stmt)
5747 && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0))
5748 {
5749 gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
5750 compute_in_loop = true;
5751 }
5752
5753 if ((alignment_support_scheme == dr_explicit_realign_optimized
5754 || alignment_support_scheme == dr_explicit_realign)
5755 && !compute_in_loop)
5756 {
5757 msq = vect_setup_realignment (first_stmt, bsi, &realignment_token,
5758 alignment_support_scheme, NULL_TREE,
5759 &at_loop);
5760 if (alignment_support_scheme == dr_explicit_realign_optimized)
5761 {
5762 phi = SSA_NAME_DEF_STMT (msq);
5763 offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5764 }
5765 }
5766 else
5767 at_loop = loop;
5768
5769 prev_stmt_info = NULL;
5770 for (j = 0; j < ncopies; j++)
5771 {
5772 /* 1. Create the vector pointer update chain. */
5773 if (j == 0)
5774 dataref_ptr = vect_create_data_ref_ptr (first_stmt,
5775 at_loop, offset,
5776 &dummy, &ptr_incr, false,
5777 NULL_TREE, &inv_p);
5778 else
5779 dataref_ptr =
5780 bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
5781
5782 for (i = 0; i < vec_num; i++)
5783 {
5784 if (i > 0)
5785 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
5786 NULL_TREE);
5787
5788 /* 2. Create the vector-load in the loop. */
5789 switch (alignment_support_scheme)
5790 {
5791 case dr_aligned:
5792 gcc_assert (aligned_access_p (first_dr));
5793 data_ref = build_fold_indirect_ref (dataref_ptr);
5794 break;
5795 case dr_unaligned_supported:
5796 {
5797 int mis = DR_MISALIGNMENT (first_dr);
5798 tree tmis = (mis == -1 ? size_zero_node : size_int (mis));
5799
5800 tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT));
5801 data_ref =
5802 build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis);
5803 break;
5804 }
5805 case dr_explicit_realign:
5806 {
5807 tree ptr, bump;
5808 tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
5809
5810 if (compute_in_loop)
5811 msq = vect_setup_realignment (first_stmt, bsi,
5812 &realignment_token,
5813 dr_explicit_realign,
5814 dataref_ptr, NULL);
5815
5816 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5817 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5818 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5819 new_temp = make_ssa_name (vec_dest, new_stmt);
5820 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5821 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5822 copy_virtual_operands (new_stmt, stmt);
5823 mark_symbols_for_renaming (new_stmt);
5824 msq = new_temp;
5825
5826 bump = size_binop (MULT_EXPR, vs_minus_1,
5827 TYPE_SIZE_UNIT (scalar_type));
5828 ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump);
5829 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr);
5830 break;
5831 }
5832 case dr_explicit_realign_optimized:
5833 data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr);
5834 break;
5835 default:
5836 gcc_unreachable ();
5837 }
5838 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5839 new_stmt = build_gimple_modify_stmt (vec_dest, data_ref);
5840 new_temp = make_ssa_name (vec_dest, new_stmt);
5841 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5842 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5843 mark_symbols_for_renaming (new_stmt);
5844
5845 /* 3. Handle explicit realignment if necessary/supported. Create in
5846 loop: vec_dest = realign_load (msq, lsq, realignment_token) */
5847 if (alignment_support_scheme == dr_explicit_realign_optimized
5848 || alignment_support_scheme == dr_explicit_realign)
5849 {
5850 lsq = GIMPLE_STMT_OPERAND (new_stmt, 0);
5851 if (!realignment_token)
5852 realignment_token = dataref_ptr;
5853 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5854 new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq,
5855 realignment_token);
5856 new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
5857 new_temp = make_ssa_name (vec_dest, new_stmt);
5858 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5859 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5860
5861 if (alignment_support_scheme == dr_explicit_realign_optimized)
5862 {
5863 if (i == vec_num - 1 && j == ncopies - 1)
5864 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
5865 msq = lsq;
5866 }
5867 }
5868
5869 /* 4. Handle invariant-load. */
5870 if (inv_p)
5871 {
5872 gcc_assert (!strided_load);
5873 gcc_assert (nested_in_vect_loop_p (loop, stmt));
5874 if (j == 0)
5875 {
5876 int k;
5877 tree t = NULL_TREE;
5878 tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type);
5879
5880 /* CHECKME: bitpos depends on endianness? */
5881 bitpos = bitsize_zero_node;
5882 vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5883 bitsize, bitpos);
5884 BIT_FIELD_REF_UNSIGNED (vec_inv) =
5885 TYPE_UNSIGNED (scalar_type);
5886 vec_dest =
5887 vect_create_destination_var (scalar_dest, NULL_TREE);
5888 new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv);
5889 new_temp = make_ssa_name (vec_dest, new_stmt);
5890 GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
5891 vect_finish_stmt_generation (stmt, new_stmt, bsi);
5892
5893 for (k = nunits - 1; k >= 0; --k)
5894 t = tree_cons (NULL_TREE, new_temp, t);
5895 /* FIXME: use build_constructor directly. */
5896 vec_inv = build_constructor_from_list (vectype, t);
5897 new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi);
5898 new_stmt = SSA_NAME_DEF_STMT (new_temp);
5899 }
5900 else
5901 gcc_unreachable (); /* FORNOW. */
5902 }
5903
5904 /* Collect vector loads and later create their permutation in
5905 vect_transform_strided_load (). */
5906 if (strided_load)
5907 VEC_quick_push (tree, dr_chain, new_temp);
5908
5909 /* Store vector loads in the corresponding SLP_NODE. */
5910 if (slp)
5911 VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
5912 }
5913
5914 /* FORNOW: SLP with multiple types is unsupported. */
5915 if (slp)
5916 return true;
5917
5918 if (strided_load)
5919 {
5920 if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
5921 return false;
5922 *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
5923 dr_chain = VEC_alloc (tree, heap, group_size);
5924 }
5925 else
5926 {
5927 if (j == 0)
5928 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5929 else
5930 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5931 prev_stmt_info = vinfo_for_stmt (new_stmt);
5932 }
5933 }
5934
5935 return true;
5936 }
5937
5938
5939 /* Function vectorizable_live_operation.
5940
5941 STMT computes a value that is used outside the loop. Check if
5942 it can be supported. */
5943
5944 bool
5945 vectorizable_live_operation (tree stmt,
5946 block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
5947 tree *vec_stmt ATTRIBUTE_UNUSED)
5948 {
5949 tree operation;
5950 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5951 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5952 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5953 int i;
5954 int op_type;
5955 tree op;
5956 tree def, def_stmt;
5957 enum vect_def_type dt;
5958
5959 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5960
5961 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5962 return false;
5963
5964 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
5965 return false;
5966
5967 if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME)
5968 return false;
5969
5970 /* FORNOW. CHECKME. */
5971 if (nested_in_vect_loop_p (loop, stmt))
5972 return false;
5973
5974 operation = GIMPLE_STMT_OPERAND (stmt, 1);
5975 op_type = TREE_OPERAND_LENGTH (operation);
5976
5977 /* FORNOW: support only if all uses are invariant. This means
5978 that the scalar operations can remain in place, unvectorized.
5979 The original last scalar value that they compute will be used. */
5980
5981 for (i = 0; i < op_type; i++)
5982 {
5983 op = TREE_OPERAND (operation, i);
5984 if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt))
5985 {
5986 if (vect_print_dump_info (REPORT_DETAILS))
5987 fprintf (vect_dump, "use not simple.");
5988 return false;
5989 }
5990
5991 if (dt != vect_invariant_def && dt != vect_constant_def)
5992 return false;
5993 }
5994
5995 /* No transformation is required for the cases we currently support. */
5996 return true;
5997 }
5998
5999
6000 /* Function vect_is_simple_cond.
6001
6002 Input:
6003 LOOP - the loop that is being vectorized.
6004 COND - Condition that is checked for simple use.
6005
6006 Returns whether a COND can be vectorized. Checks whether
6007 condition operands are supportable using vect_is_simple_use. */
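/* For illustration (hypothetical operands): a condition such as a_1 < b_2 is
   accepted when both SSA operands have simple defs, and a_1 < 5 is accepted
   because the RHS is a constant; a condition whose operand is neither an
   SSA_NAME nor an INTEGER_CST/REAL_CST/FIXED_CST is rejected by the checks
   below.  */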
6008
6009 static bool
6010 vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo)
6011 {
6012 tree lhs, rhs;
6013 tree def;
6014 enum vect_def_type dt;
6015
6016 if (!COMPARISON_CLASS_P (cond))
6017 return false;
6018
6019 lhs = TREE_OPERAND (cond, 0);
6020 rhs = TREE_OPERAND (cond, 1);
6021
6022 if (TREE_CODE (lhs) == SSA_NAME)
6023 {
6024 tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs);
6025 if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt))
6026 return false;
6027 }
6028 else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST
6029 && TREE_CODE (lhs) != FIXED_CST)
6030 return false;
6031
6032 if (TREE_CODE (rhs) == SSA_NAME)
6033 {
6034 tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs);
6035 if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt))
6036 return false;
6037 }
6038 else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST
6039 && TREE_CODE (rhs) != FIXED_CST)
6040 return false;
6041
6042 return true;
6043 }
6044
6045 /* vectorizable_condition.
6046
6047 Check if STMT is a conditional modify expression that can be vectorized.
6048 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6049 stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it
6050 at BSI.
6051
6052 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
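/* For illustration (hypothetical stmt): a scalar statement
   a = (b < c) ? d : e is vectorized below roughly as
   va = VEC_COND_EXPR <vb < vc, vd, ve>, where the vector compare operands and
   the vectorized then/else clauses are obtained via
   vect_get_vec_def_for_operand.  */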
6053
6054 bool
6055 vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
6056 {
6057 tree scalar_dest = NULL_TREE;
6058 tree vec_dest = NULL_TREE;
6059 tree op = NULL_TREE;
6060 tree cond_expr, then_clause, else_clause;
6061 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6062 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6063 tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause;
6064 tree vec_compare, vec_cond_expr;
6065 tree new_temp;
6066 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6067 enum machine_mode vec_mode;
6068 tree def;
6069 enum vect_def_type dt;
6070 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6071 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6072
6073 gcc_assert (ncopies >= 1);
6074 if (ncopies > 1)
6075 return false; /* FORNOW */
6076
6077 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6078 return false;
6079
6080 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
6081 return false;
6082
6083 /* FORNOW: SLP not supported. */
6084 if (STMT_SLP_TYPE (stmt_info))
6085 return false;
6086
6087 /* FORNOW: not yet supported. */
6088 if (STMT_VINFO_LIVE_P (stmt_info))
6089 {
6090 if (vect_print_dump_info (REPORT_DETAILS))
6091 fprintf (vect_dump, "value used after loop.");
6092 return false;
6093 }
6094
6095 /* Is vectorizable conditional operation? */
6096 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
6097 return false;
6098
6099 op = GIMPLE_STMT_OPERAND (stmt, 1);
6100
6101 if (TREE_CODE (op) != COND_EXPR)
6102 return false;
6103
6104 cond_expr = TREE_OPERAND (op, 0);
6105 then_clause = TREE_OPERAND (op, 1);
6106 else_clause = TREE_OPERAND (op, 2);
6107
6108 if (!vect_is_simple_cond (cond_expr, loop_vinfo))
6109 return false;
6110
6111 /* We do not handle two different vector types for the condition
6112 and the values. */
6113 if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype))
6114 return false;
6115
6116 if (TREE_CODE (then_clause) == SSA_NAME)
6117 {
6118 tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause);
6119 if (!vect_is_simple_use (then_clause, loop_vinfo,
6120 &then_def_stmt, &def, &dt))
6121 return false;
6122 }
6123 else if (TREE_CODE (then_clause) != INTEGER_CST
6124 && TREE_CODE (then_clause) != REAL_CST
6125 && TREE_CODE (then_clause) != FIXED_CST)
6126 return false;
6127
6128 if (TREE_CODE (else_clause) == SSA_NAME)
6129 {
6130 tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause);
6131 if (!vect_is_simple_use (else_clause, loop_vinfo,
6132 &else_def_stmt, &def, &dt))
6133 return false;
6134 }
6135 else if (TREE_CODE (else_clause) != INTEGER_CST
6136 && TREE_CODE (else_clause) != REAL_CST
6137 && TREE_CODE (else_clause) != FIXED_CST)
6138 return false;
6139
6140
6141 vec_mode = TYPE_MODE (vectype);
6142
6143 if (!vec_stmt)
6144 {
6145 STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type;
6146 return expand_vec_cond_expr_p (op, vec_mode);
6147 }
6148
6149 /* Transform */
6150
6151 /* Handle def. */
6152 scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
6153 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6154
6155 /* Handle cond expr. */
6156 vec_cond_lhs =
6157 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL);
6158 vec_cond_rhs =
6159 vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL);
6160 vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL);
6161 vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL);
6162
6163 /* Arguments are ready. Create the new vector stmt. */
6164 vec_compare = build2 (TREE_CODE (cond_expr), vectype,
6165 vec_cond_lhs, vec_cond_rhs);
6166 vec_cond_expr = build3 (VEC_COND_EXPR, vectype,
6167 vec_compare, vec_then_clause, vec_else_clause);
6168
6169 *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr);
6170 new_temp = make_ssa_name (vec_dest, *vec_stmt);
6171 GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
6172 vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
6173
6174 return true;
6175 }
6176
6177
6178 /* Function vect_transform_stmt.
6179
6180 Create a vectorized stmt to replace STMT, and insert it at BSI. */
6181
6182 static bool
6183 vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
6184 slp_tree slp_node)
6185 {
6186 bool is_store = false;
6187 tree vec_stmt = NULL_TREE;
6188 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6189 tree orig_stmt_in_pattern;
6190 bool done;
6191
6192 switch (STMT_VINFO_TYPE (stmt_info))
6193 {
6194 case type_demotion_vec_info_type:
6195 gcc_assert (!slp_node);
6196 done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
6197 gcc_assert (done);
6198 break;
6199
6200 case type_promotion_vec_info_type:
6201 gcc_assert (!slp_node);
6202 done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
6203 gcc_assert (done);
6204 break;
6205
6206 case type_conversion_vec_info_type:
6207 done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
6208 gcc_assert (done);
6209 break;
6210
6211 case induc_vec_info_type:
6212 gcc_assert (!slp_node);
6213 done = vectorizable_induction (stmt, bsi, &vec_stmt);
6214 gcc_assert (done);
6215 break;
6216
6217 case op_vec_info_type:
6218 done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
6219 gcc_assert (done);
6220 break;
6221
6222 case assignment_vec_info_type:
6223 done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
6224 gcc_assert (done);
6225 break;
6226
6227 case load_vec_info_type:
6228 done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
6229 gcc_assert (done);
6230 break;
6231
6232 case store_vec_info_type:
6233 done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
6234 gcc_assert (done);
6235 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6236 {
6237 /* In case of interleaving, the whole chain is vectorized when the
6238 last store in the chain is reached. Store stmts before the last
6239 one are skipped, and their vec_stmt_info shouldn't be freed
6240 meanwhile. */
6241 *strided_store = true;
6242 if (STMT_VINFO_VEC_STMT (stmt_info))
6243 is_store = true;
6244 }
6245 else
6246 is_store = true;
6247 break;
6248
6249 case condition_vec_info_type:
6250 gcc_assert (!slp_node);
6251 done = vectorizable_condition (stmt, bsi, &vec_stmt);
6252 gcc_assert (done);
6253 break;
6254
6255 case call_vec_info_type:
6256 gcc_assert (!slp_node);
6257 done = vectorizable_call (stmt, bsi, &vec_stmt);
6258 break;
6259
6260 case reduc_vec_info_type:
6261 gcc_assert (!slp_node);
6262 done = vectorizable_reduction (stmt, bsi, &vec_stmt);
6263 gcc_assert (done);
6264 break;
6265
6266 default:
6267 if (!STMT_VINFO_LIVE_P (stmt_info))
6268 {
6269 if (vect_print_dump_info (REPORT_DETAILS))
6270 fprintf (vect_dump, "stmt not supported.");
6271 gcc_unreachable ();
6272 }
6273 }
6274
6275 if (STMT_VINFO_LIVE_P (stmt_info)
6276 && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type)
6277 {
6278 done = vectorizable_live_operation (stmt, bsi, &vec_stmt);
6279 gcc_assert (done);
6280 }
6281
6282 if (vec_stmt)
6283 {
6284 STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
6285 orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
6286 if (orig_stmt_in_pattern)
6287 {
6288 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
6289 /* STMT was inserted by the vectorizer to replace a computation idiom.
6290 ORIG_STMT_IN_PATTERN is a stmt in the original sequence that
6291 computed this idiom. We need to record a pointer to VEC_STMT in
6292 the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the
6293 documentation of vect_pattern_recog. */
6294 if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
6295 {
6296 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
6297 STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
6298 }
6299 }
6300 }
6301
6302 return is_store;
6303 }
6304
6305
6306 /* This function builds ni_name = the number of iterations the loop
6307 executes, on the loop preheader. */
6308
6309 static tree
6310 vect_build_loop_niters (loop_vec_info loop_vinfo)
6311 {
6312 tree ni_name, stmt, var;
6313 edge pe;
6314 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6315 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6316
6317 var = create_tmp_var (TREE_TYPE (ni), "niters");
6318 add_referenced_var (var);
6319 ni_name = force_gimple_operand (ni, &stmt, false, var);
6320
6321 pe = loop_preheader_edge (loop);
6322 if (stmt)
6323 {
6324 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6325 gcc_assert (!new_bb);
6326 }
6327
6328 return ni_name;
6329 }
6330
6331
6332 /* This function generates the following statements:
6333
6334 ni_name = number of iterations the loop executes
6335 ratio = ni_name / vf
6336 ratio_mult_vf_name = ratio * vf
6337
6338 and places them at the loop preheader edge. */
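/* For illustration (hypothetical counts): with ni_name = 1003 and vf = 4,
   the shifts below compute ratio = 1003 >> 2 = 250 and
   ratio_mult_vf_name = 250 << 2 = 1000.  */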
6339
6340 static void
6341 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6342 tree *ni_name_ptr,
6343 tree *ratio_mult_vf_name_ptr,
6344 tree *ratio_name_ptr)
6345 {
6346
6347 edge pe;
6348 basic_block new_bb;
6349 tree stmt, ni_name;
6350 tree var;
6351 tree ratio_name;
6352 tree ratio_mult_vf_name;
6353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6354 tree ni = LOOP_VINFO_NITERS (loop_vinfo);
6355 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6356 tree log_vf;
6357
6358 pe = loop_preheader_edge (loop);
6359
6360 /* Generate a temporary variable that contains the
6361 number of iterations the loop executes. */
6362
6363 ni_name = vect_build_loop_niters (loop_vinfo);
6364 log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
6365
6366 /* Create: ratio = ni >> log2(vf) */
6367
6368 ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
6369 if (!is_gimple_val (ratio_name))
6370 {
6371 var = create_tmp_var (TREE_TYPE (ni), "bnd");
6372 add_referenced_var (var);
6373
6374 ratio_name = force_gimple_operand (ratio_name, &stmt, true, var);
6375 pe = loop_preheader_edge (loop);
6376 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6377 gcc_assert (!new_bb);
6378 }
6379
6380 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6381
6382 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6383 ratio_name, log_vf);
6384 if (!is_gimple_val (ratio_mult_vf_name))
6385 {
6386 var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf");
6387 add_referenced_var (var);
6388
6389 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt,
6390 true, var);
6391 pe = loop_preheader_edge (loop);
6392 new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6393 gcc_assert (!new_bb);
6394 }
6395
6396 *ni_name_ptr = ni_name;
6397 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6398 *ratio_name_ptr = ratio_name;
6399
6400 return;
6401 }
6402
6403
6404 /* Function vect_update_ivs_after_vectorizer.
6405
6406 "Advance" the induction variables of LOOP to the value they should take
6407 after the execution of LOOP. This is currently necessary because the
6408 vectorizer does not handle induction variables that are used after the
6409 loop. Such a situation occurs when the last iterations of LOOP are
6410 peeled, because:
6411 1. We introduced new uses after LOOP for IVs that were not originally used
6412 after LOOP: the IVs of LOOP are now used by an epilog loop.
6413 2. LOOP is going to be vectorized; this means that it will iterate N/VF
6414 times, whereas the loop IVs should be bumped N times.
6415
6416 Input:
6417 - LOOP - a loop that is going to be vectorized. The last few iterations
6418 of LOOP were peeled.
6419 - NITERS - the number of iterations that LOOP executes (before it is
6420 vectorized), i.e., the number of times the ivs should be bumped.
6421 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
6422 coming out from LOOP on which there are uses of the LOOP ivs
6423 (this is the path from LOOP->exit to epilog_loop->preheader).
6424
6425 The new definitions of the ivs are placed in LOOP->exit.
6426 The phi args associated with the edge UPDATE_E in the bb
6427 UPDATE_E->dest are updated accordingly.
6428
6429 Assumption 1: Like the rest of the vectorizer, this function assumes
6430 a single loop exit that has a single predecessor.
6431
6432 Assumption 2: The phi nodes in the LOOP header and in update_bb are
6433 organized in the same order.
6434
6435 Assumption 3: The access function of the ivs is simple enough (see
6436 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
6437
6438 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
6439 coming out of LOOP on which the ivs of LOOP are used (this is the path
6440 that leads to the epilog loop; other paths skip the epilog loop). This
6441 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
6442 needs to have its phis updated.
6443 */
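/* For illustration (hypothetical IV): for an induction variable with initial
   value INIT and step STEP, the code below emits ni = INIT + NITERS * STEP
   (a POINTER_PLUS_EXPR for pointer IVs) at the end of the exit bb and uses
   it as the phi argument on UPDATE_E.  */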
6444
6445 static void
6446 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
6447 edge update_e)
6448 {
6449 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6450 basic_block exit_bb = single_exit (loop)->dest;
6451 tree phi, phi1;
6452 basic_block update_bb = update_e->dest;
6453
6454 /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */
6455
6456 /* Make sure there exists a single-predecessor exit bb: */
6457 gcc_assert (single_pred_p (exit_bb));
6458
6459 for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb);
6460 phi && phi1;
6461 phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1))
6462 {
6463 tree access_fn = NULL;
6464 tree evolution_part;
6465 tree init_expr;
6466 tree step_expr;
6467 tree var, ni, ni_name;
6468 block_stmt_iterator last_bsi;
6469
6470 if (vect_print_dump_info (REPORT_DETAILS))
6471 {
6472 fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: ");
6473 print_generic_expr (vect_dump, phi, TDF_SLIM);
6474 }
6475
6476 /* Skip virtual phis. */
6477 if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi))))
6478 {
6479 if (vect_print_dump_info (REPORT_DETAILS))
6480 fprintf (vect_dump, "virtual phi. skip.");
6481 continue;
6482 }
6483
6484 /* Skip reduction phis. */
6485 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
6486 {
6487 if (vect_print_dump_info (REPORT_DETAILS))
6488 fprintf (vect_dump, "reduc phi. skip.");
6489 continue;
6490 }
6491
6492 access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi));
6493 gcc_assert (access_fn);
6494 evolution_part =
6495 unshare_expr (evolution_part_in_loop_num (access_fn, loop->num));
6496 gcc_assert (evolution_part != NULL_TREE);
6497
6498 /* FORNOW: We do not support IVs whose evolution function is a polynomial
6499 of degree >= 2 or exponential. */
6500 gcc_assert (!tree_is_chrec (evolution_part));
6501
6502 step_expr = evolution_part;
6503 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
6504 loop->num));
6505
6506 if (POINTER_TYPE_P (TREE_TYPE (init_expr)))
6507 ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr),
6508 init_expr,
6509 fold_convert (sizetype,
6510 fold_build2 (MULT_EXPR, TREE_TYPE (niters),
6511 niters, step_expr)));
6512 else
6513 ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr),
6514 fold_build2 (MULT_EXPR, TREE_TYPE (init_expr),
6515 fold_convert (TREE_TYPE (init_expr),
6516 niters),
6517 step_expr),
6518 init_expr);
6519
6520
6521
6522 var = create_tmp_var (TREE_TYPE (init_expr), "tmp");
6523 add_referenced_var (var);
6524
6525 last_bsi = bsi_last (exit_bb);
6526 ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var,
6527 true, BSI_SAME_STMT);
6528
6529 /* Fix phi expressions in the successor bb. */
6530 SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name);
6531 }
6532 }
6533
6534 /* Return the more conservative threshold between the
6535 min_profitable_iters returned by the cost model and the user
6536 specified threshold, if provided. */
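/* For illustration (hypothetical numbers): if the user-specified bound
   evaluates to PARAM_MIN_VECT_LOOP_BOUND * VF - 1 = 7 and the cost model
   returns min_profitable_iters = 10, the more conservative value 10 is used
   as the runtime threshold.  */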
6537
6538 static unsigned int
6539 conservative_cost_threshold (loop_vec_info loop_vinfo,
6540 int min_profitable_iters)
6541 {
6542 unsigned int th;
6543 int min_scalar_loop_bound;
6544
6545 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
6546 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
6547
6548 /* Use the cost model only if it is more conservative than user specified
6549 threshold. */
6550 th = (unsigned) min_scalar_loop_bound;
6551 if (min_profitable_iters
6552 && (!min_scalar_loop_bound
6553 || min_profitable_iters > min_scalar_loop_bound))
6554 th = (unsigned) min_profitable_iters;
6555
6556 if (th && vect_print_dump_info (REPORT_COST))
6557 fprintf (vect_dump, "Vectorization may not be profitable.");
6558
6559 return th;
6560 }
6561
6562 /* Function vect_do_peeling_for_loop_bound
6563
6564 Peel the last iterations of the loop represented by LOOP_VINFO.
6565 The peeled iterations form a new epilog loop. Given that the loop now
6566 iterates NITERS times, the new epilog loop iterates
6567 NITERS % VECTORIZATION_FACTOR times.
6568
6569 The original loop will later be made to iterate
6570 NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */
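/* For illustration (hypothetical counts): with NITERS = 1003 and a
   vectorization factor of 4, the vectorized loop runs RATIO = 250 times and
   the epilog loop runs the remaining 3 iterations.  */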
6571
6572 static void
6573 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
6574 {
6575 tree ni_name, ratio_mult_vf_name;
6576 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6577 struct loop *new_loop;
6578 edge update_e;
6579 basic_block preheader;
6580 int loop_num;
6581 bool check_profitability = false;
6582 unsigned int th = 0;
6583 int min_profitable_iters;
6584
6585 if (vect_print_dump_info (REPORT_DETAILS))
6586 fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ===");
6587
6588 initialize_original_copy_tables ();
6589
6590   /* Generate the following variables on the preheader of the original loop:
6591
6592      ni_name = number of iterations the original loop executes
6593 ratio = ni_name / vf
6594 ratio_mult_vf_name = ratio * vf */
6595 vect_generate_tmps_on_preheader (loop_vinfo, &ni_name,
6596 &ratio_mult_vf_name, ratio);
6597
6598 loop_num = loop->num;
6599
6600   /* If the cost model check was not done during versioning or during
6601      peeling for alignment, do it here.  */
6602 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6603 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))
6604 && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
6605 {
6606 check_profitability = true;
6607
6608 /* Get profitability threshold for vectorized loop. */
6609 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6610
6611 th = conservative_cost_threshold (loop_vinfo,
6612 min_profitable_iters);
6613 }
6614
6615 new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop),
6616 ratio_mult_vf_name, ni_name, false,
6617 th, check_profitability);
6618 gcc_assert (new_loop);
6619 gcc_assert (loop_num == loop->num);
6620 #ifdef ENABLE_CHECKING
6621 slpeel_verify_cfg_after_peeling (loop, new_loop);
6622 #endif
6623
6624 /* A guard that controls whether the new_loop is to be executed or skipped
6625 is placed in LOOP->exit. LOOP->exit therefore has two successors - one
6626 is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
6627 is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
6628 is on the path where the LOOP IVs are used and need to be updated. */
6629
6630 preheader = loop_preheader_edge (new_loop)->src;
6631 if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
6632 update_e = EDGE_PRED (preheader, 0);
6633 else
6634 update_e = EDGE_PRED (preheader, 1);
6635
6636 /* Update IVs of original loop as if they were advanced
6637 by ratio_mult_vf_name steps. */
6638 vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
6639
6640   /* After peeling we have to reset the scalar evolution analyzer.  */
6641 scev_reset ();
6642
6643 free_original_copy_tables ();
6644 }
6645
6646
6647 /* Function vect_gen_niters_for_prolog_loop
6648
6649    Compute the number of iterations of the prolog loop peeled from the loop
6650    represented by LOOP_VINFO: the minimum between LOOP_NITERS (the original
6651    iteration count of the loop) and the number of iterations needed to align
6652    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
6653    As a result, after the execution of this loop, DR will refer to an aligned location.
6654
6655 The following computation is generated:
6656
6657 If the misalignment of DR is known at compile time:
6658      addr_mis = DR_MISALIGNMENT (dr);
6659 Else, compute address misalignment in bytes:
6660 addr_mis = addr & (vectype_size - 1)
6661
6662 prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) )
6663
6664 (elem_size = element type size; an element is the scalar element
6665 whose type is the inner type of the vectype)
6666
6667 For interleaving,
6668
6669 prolog_niters = min ( LOOP_NITERS ,
6670 (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) )
6671 where group_size is the size of the interleaved group.
6672
6673 The above formulas assume that VF == number of elements in the vector. This
6674    may not hold when there are multiple types in the loop.
6675 In this case, for some data-references in the loop the VF does not represent
6676 the number of elements that fit in the vector. Therefore, instead of VF we
6677 use TYPE_VECTOR_SUBPARTS. */
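/* For illustration (hypothetical values): for a V4SI vectype (4 elements of
   4 bytes each) and addr_mis = 8 bytes, addr_mis/elem_size = 2, so
   prolog_niters = min (LOOP_NITERS, (4 - 2) & 3) = min (LOOP_NITERS, 2):
   two scalar iterations are peeled before the access becomes aligned.  */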
6678
6679 static tree
6680 vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
6681 {
6682 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
6683 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6684 tree var, stmt;
6685 tree iters, iters_name;
6686 edge pe;
6687 basic_block new_bb;
6688 tree dr_stmt = DR_STMT (dr);
6689 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
6690 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6691 int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
6692 tree niters_type = TREE_TYPE (loop_niters);
6693 int group_size = 1;
6694 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
6695 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
6696
6697 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
6698 {
6699       /* For interleaved access, the element size must be multiplied by the
6700          size of the interleaved group.  */
6701 group_size = DR_GROUP_SIZE (vinfo_for_stmt (
6702 DR_GROUP_FIRST_DR (stmt_info)));
6703 element_size *= group_size;
6704 }
6705
6706 pe = loop_preheader_edge (loop);
6707
6708 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
6709 {
6710 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
6711 int elem_misalign = byte_misalign / element_size;
6712
6713 if (vect_print_dump_info (REPORT_DETAILS))
6714 fprintf (vect_dump, "known alignment = %d.", byte_misalign);
6715 iters = build_int_cst (niters_type,
6716 (nelements - elem_misalign)&(nelements/group_size-1));
6717 }
6718 else
6719 {
6720 tree new_stmts = NULL_TREE;
6721 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
6722 &new_stmts, NULL_TREE, loop);
6723 tree ptr_type = TREE_TYPE (start_addr);
6724 tree size = TYPE_SIZE (ptr_type);
6725 tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1);
6726 tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1);
6727 tree elem_size_log =
6728 build_int_cst (type, exact_log2 (vectype_align/nelements));
6729 tree nelements_minus_1 = build_int_cst (type, nelements - 1);
6730 tree nelements_tree = build_int_cst (type, nelements);
6731 tree byte_misalign;
6732 tree elem_misalign;
6733
6734 new_bb = bsi_insert_on_edge_immediate (pe, new_stmts);
6735 gcc_assert (!new_bb);
6736
6737 /* Create: byte_misalign = addr & (vectype_size - 1) */
6738 byte_misalign =
6739 fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1);
6740
6741 /* Create: elem_misalign = byte_misalign / element_size */
6742 elem_misalign =
6743 fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
6744
6745 /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
6746 iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
6747 iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
6748 iters = fold_convert (niters_type, iters);
6749 }
6750
6751 /* Create: prolog_loop_niters = min (iters, loop_niters) */
6752 /* If the loop bound is known at compile time we already verified that it is
6753 greater than vf; since the misalignment ('iters') is at most vf, there's
6754 no need to generate the MIN_EXPR in this case. */
6755 if (TREE_CODE (loop_niters) != INTEGER_CST)
6756 iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
6757
6758 if (vect_print_dump_info (REPORT_DETAILS))
6759 {
6760 fprintf (vect_dump, "niters for prolog loop: ");
6761 print_generic_expr (vect_dump, iters, TDF_SLIM);
6762 }
6763
6764 var = create_tmp_var (niters_type, "prolog_loop_niters");
6765 add_referenced_var (var);
6766 iters_name = force_gimple_operand (iters, &stmt, false, var);
6767
6768 /* Insert stmt on loop preheader edge. */
6769 if (stmt)
6770 {
6771 basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt);
6772 gcc_assert (!new_bb);
6773 }
6774
6775 return iters_name;
6776 }
6777
6778
6779 /* Function vect_update_init_of_dr
6780
6781 NITERS iterations were peeled from LOOP. DR represents a data reference
6782 in LOOP. This function updates the information recorded in DR to
6783 account for the fact that the first NITERS iterations had already been
6784 executed. Specifically, it updates the OFFSET field of DR. */
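/* For illustration (hypothetical values): for a data reference with
   DR_STEP = 4 (bytes advanced per iteration) and NITERS = 3 peeled
   iterations, the OFFSET of the reference is increased by 3 * 4 = 12.  */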
6785
6786 static void
6787 vect_update_init_of_dr (struct data_reference *dr, tree niters)
6788 {
6789 tree offset = DR_OFFSET (dr);
6790
6791 niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr));
6792 offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters);
6793 DR_OFFSET (dr) = offset;
6794 }
6795
6796
6797 /* Function vect_update_inits_of_drs
6798
6799 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
6800 This function updates the information recorded for the data references in
6801 the loop to account for the fact that the first NITERS iterations had
6802 already been executed. Specifically, it updates the initial_condition of
6803 the access_function of all the data_references in the loop. */
6804
6805 static void
6806 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
6807 {
6808 unsigned int i;
6809 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
6810 struct data_reference *dr;
6811
6812 if (vect_print_dump_info (REPORT_DETAILS))
6813     fprintf (vect_dump, "=== vect_update_inits_of_drs ===");
6814
6815 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
6816 vect_update_init_of_dr (dr, niters);
6817 }
6818
6819
6820 /* Function vect_do_peeling_for_alignment
6821
6822    Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
6823    'niters' is determined by the misalignment of one of the data references
6824    in the loop, so that after the peel that reference refers to an aligned
6825    location at the beginning of the execution of this loop.  The data
6826    reference for which we are peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */
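/* For illustration (hypothetical values): if vect_gen_niters_for_prolog_loop
   computes 2 peeled iterations for the unaligned data reference and the loop
   originally executes n times, the prolog loop executes min (n, 2) iterations
   and LOOP_VINFO_NITERS is reduced by that amount.  */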
6827
6828 static void
6829 vect_do_peeling_for_alignment (loop_vec_info loop_vinfo)
6830 {
6831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6832 tree niters_of_prolog_loop, ni_name;
6833 tree n_iters;
6834 struct loop *new_loop;
6835 bool check_profitability = false;
6836 unsigned int th = 0;
6837 int min_profitable_iters;
6838
6839 if (vect_print_dump_info (REPORT_DETAILS))
6840 fprintf (vect_dump, "=== vect_do_peeling_for_alignment ===");
6841
6842 initialize_original_copy_tables ();
6843
6844 ni_name = vect_build_loop_niters (loop_vinfo);
6845 niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name);
6846
6847
6848   /* If the cost model check was not done during versioning, do it here.  */
6849 if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
6850 && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
6851 {
6852 check_profitability = true;
6853
6854 /* Get profitability threshold for vectorized loop. */
6855 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
6856
6857 th = conservative_cost_threshold (loop_vinfo,
6858 min_profitable_iters);
6859 }
6860
6861   /* Peel the prolog loop and iterate it niters_of_prolog_loop times.  */
6862 new_loop =
6863 slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop),
6864 niters_of_prolog_loop, ni_name, true,
6865 th, check_profitability);
6866
6867 gcc_assert (new_loop);
6868 #ifdef ENABLE_CHECKING
6869 slpeel_verify_cfg_after_peeling (new_loop, loop);
6870 #endif
6871
6872   /* Update the number of times the loop executes.  */
6873 n_iters = LOOP_VINFO_NITERS (loop_vinfo);
6874 LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
6875 TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop);
6876
6877 /* Update the init conditions of the access functions of all data refs. */
6878 vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop);
6879
6880   /* After peeling we have to reset the scalar evolution analyzer.  */
6881 scev_reset ();
6882
6883 free_original_copy_tables ();
6884 }
6885
6886
6887 /* Function vect_create_cond_for_align_checks.
6888
6889 Create a conditional expression that represents the alignment checks for
6890    all of the data references (array element references) whose alignment must be
6891 checked at runtime.
6892
6893 Input:
6894 COND_EXPR - input conditional expression. New conditions will be chained
6895 with logical AND operation.
6896 LOOP_VINFO - two fields of the loop information are used.
6897 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
6898 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
6899
6900 Output:
6901 COND_EXPR_STMT_LIST - statements needed to construct the conditional
6902 expression.
6903 The returned value is the conditional expression to be used in the if
6904 statement that controls which version of the loop gets executed at runtime.
6905
6906 The algorithm makes two assumptions:
6907 1) The number of bytes "n" in a vector is a power of 2.
6908      2) An address "a" is aligned if a%n is zero, so the
6909         test can be done as a&(n-1) == 0.  For example, for
6910         16-byte vectors the test is a&0xf == 0. */
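/* For illustration (hypothetical references a and b, 16-byte vectors, so
   LOOP_VINFO_PTR_MASK = 0xf), the generated statements look roughly like:

     addr2int0 = (pointer-sized int) &first_vector_of_a;
     addr2int1 = (pointer-sized int) &first_vector_of_b;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;

   and the returned condition is  andmask == 0.  */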
6911
6912 static void
6913 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
6914 tree *cond_expr,
6915 tree *cond_expr_stmt_list)
6916 {
6917 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6918 VEC(tree,heap) *may_misalign_stmts
6919 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
6920 tree ref_stmt, tmp;
6921 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
6922 tree mask_cst;
6923 unsigned int i;
6924 tree psize;
6925 tree int_ptrsize_type;
6926 char tmp_name[20];
6927 tree or_tmp_name = NULL_TREE;
6928 tree and_tmp, and_tmp_name, and_stmt;
6929 tree ptrsize_zero;
6930 tree part_cond_expr;
6931
6932 /* Check that mask is one less than a power of 2, i.e., mask is
6933 all zeros followed by all ones. */
6934 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
6935
6936 /* CHECKME: what is the best integer or unsigned type to use to hold a
6937 cast from a pointer value? */
6938 psize = TYPE_SIZE (ptr_type_node);
6939 int_ptrsize_type
6940 = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0);
6941
6942   /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
6943      of the first vector of the i'th data reference. */
6944
6945 for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++)
6946 {
6947 tree new_stmt_list = NULL_TREE;
6948 tree addr_base;
6949 tree addr_tmp, addr_tmp_name, addr_stmt;
6950 tree or_tmp, new_or_tmp_name, or_stmt;
6951
6952 /* create: addr_tmp = (int)(address_of_first_vector) */
6953 addr_base = vect_create_addr_base_for_vector_ref (ref_stmt,
6954 &new_stmt_list, NULL_TREE, loop);
6955
6956 if (new_stmt_list != NULL_TREE)
6957 append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list);
6958
6959 sprintf (tmp_name, "%s%d", "addr2int", i);
6960 addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6961 add_referenced_var (addr_tmp);
6962 addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE);
6963 addr_stmt = fold_convert (int_ptrsize_type, addr_base);
6964 addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt);
6965 SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt;
6966 append_to_statement_list_force (addr_stmt, cond_expr_stmt_list);
6967
6968       /* The addresses are ORed together.  */
6969
6970 if (or_tmp_name != NULL_TREE)
6971 {
6972 /* create: or_tmp = or_tmp | addr_tmp */
6973 sprintf (tmp_name, "%s%d", "orptrs", i);
6974 or_tmp = create_tmp_var (int_ptrsize_type, tmp_name);
6975 add_referenced_var (or_tmp);
6976 new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE);
6977 tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type,
6978 or_tmp_name, addr_tmp_name);
6979 or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp);
6980 SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt;
6981 append_to_statement_list_force (or_stmt, cond_expr_stmt_list);
6982 or_tmp_name = new_or_tmp_name;
6983 }
6984 else
6985 or_tmp_name = addr_tmp_name;
6986
6987 } /* end for i */
6988
6989 mask_cst = build_int_cst (int_ptrsize_type, mask);
6990
6991 /* create: and_tmp = or_tmp & mask */
6992 and_tmp = create_tmp_var (int_ptrsize_type, "andmask" );
6993 add_referenced_var (and_tmp);
6994 and_tmp_name = make_ssa_name (and_tmp, NULL_TREE);
6995
6996 tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst);
6997 and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp);
6998 SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt;
6999 append_to_statement_list_force (and_stmt, cond_expr_stmt_list);
7000
7001 /* Make and_tmp the left operand of the conditional test against zero.
7002      If and_tmp has a nonzero bit, then some address is unaligned.  */
7003 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
7004 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
7005 and_tmp_name, ptrsize_zero);
7006 if (*cond_expr)
7007 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7008 *cond_expr, part_cond_expr);
7009 else
7010 *cond_expr = part_cond_expr;
7011 }
7012
7013 /* Function vect_vfa_segment_size.
7014
7015    Create an expression that computes the size of the segment
7016    that will be accessed for a data reference.  The function takes into
7017 account that realignment loads may access one more vector.
7018
7019 Input:
7020 DR: The data reference.
7021 VECT_FACTOR: vectorization factor.
7022
7023    Return an expression whose value is the size of the segment which will be
7024 accessed by DR. */
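/* For illustration (hypothetical values): for a data reference with
   DR_STEP = 4 bytes and VECT_FACTOR = 4, the segment is 4 * 4 = 16 bytes;
   if the reference uses the optimized explicit-realignment scheme, the size
   of one extra vector (e.g. 16 bytes) is added, giving 32 bytes.  */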
7025
7026 static tree
7027 vect_vfa_segment_size (struct data_reference *dr, tree vect_factor)
7028 {
7029 tree segment_length = fold_build2 (MULT_EXPR, integer_type_node,
7030 DR_STEP (dr), vect_factor);
7031
7032 if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized)
7033 {
7034 tree vector_size = TYPE_SIZE_UNIT
7035 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
7036
7037 segment_length = fold_build2 (PLUS_EXPR, integer_type_node,
7038 segment_length, vector_size);
7039 }
7040 return fold_convert (sizetype, segment_length);
7041 }
7042
7043 /* Function vect_create_cond_for_alias_checks.
7044
7045 Create a conditional expression that represents the run-time checks for
7046    overlapping of address ranges represented by a list of data reference
7047    relations passed as input.
7048
7049 Input:
7050 COND_EXPR - input conditional expression. New conditions will be chained
7051 with logical AND operation.
7052    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_DDRS contains the list of ddrs
7053 to be checked.
7054
7055 Output:
7056 COND_EXPR - conditional expression.
7057 COND_EXPR_STMT_LIST - statements needed to construct the conditional
7058 expression.
7059
7060
7061 The returned value is the conditional expression to be used in the if
7062 statement that controls which version of the loop gets executed at runtime.
7063 */
7064
7065 static void
7066 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
7067 tree * cond_expr,
7068 tree * cond_expr_stmt_list)
7069 {
7070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7071 VEC (ddr_p, heap) * may_alias_ddrs =
7072 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
7073 tree vect_factor =
7074 build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
7075
7076 ddr_p ddr;
7077 unsigned int i;
7078 tree part_cond_expr;
7079
7080 /* Create expression
7081      (((store_ptr_0 + store_segment_length_0) < load_ptr_0)
7082       || ((load_ptr_0 + load_segment_length_0) < store_ptr_0))
7083      &&
7084      ...
7085      &&
7086      (((store_ptr_n + store_segment_length_n) < load_ptr_n)
7087       || ((load_ptr_n + load_segment_length_n) < store_ptr_n))  */
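  /* For illustration (hypothetical references): for a loop computing
     a[i] = b[i] with 4-byte elements and a vectorization factor of 4,
     the generated check is roughly
       (&a[0] + 16 < &b[0]) || (&b[0] + 16 < &a[0]).  */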
7088
7089 if (VEC_empty (ddr_p, may_alias_ddrs))
7090 return;
7091
7092 for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++)
7093 {
7094 struct data_reference *dr_a, *dr_b;
7095 tree dr_group_first_a, dr_group_first_b;
7096 tree addr_base_a, addr_base_b;
7097 tree segment_length_a, segment_length_b;
7098 tree stmt_a, stmt_b;
7099
7100 dr_a = DDR_A (ddr);
7101 stmt_a = DR_STMT (DDR_A (ddr));
7102 dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a));
7103 if (dr_group_first_a)
7104 {
7105 stmt_a = dr_group_first_a;
7106 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
7107 }
7108
7109 dr_b = DDR_B (ddr);
7110 stmt_b = DR_STMT (DDR_B (ddr));
7111 dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b));
7112 if (dr_group_first_b)
7113 {
7114 stmt_b = dr_group_first_b;
7115 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
7116 }
7117
7118 addr_base_a =
7119 vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list,
7120 NULL_TREE, loop);
7121 addr_base_b =
7122 vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list,
7123 NULL_TREE, loop);
7124
7125 segment_length_a = vect_vfa_segment_size (dr_a, vect_factor);
7126 segment_length_b = vect_vfa_segment_size (dr_b, vect_factor);
7127
7128 if (vect_print_dump_info (REPORT_DR_DETAILS))
7129 {
7130 fprintf (vect_dump,
7131 "create runtime check for data references ");
7132 print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM);
7133 fprintf (vect_dump, " and ");
7134 print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM);
7135 }
7136
7137
7138 part_cond_expr =
7139 fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
7140 fold_build2 (LT_EXPR, boolean_type_node,
7141 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a),
7142 addr_base_a,
7143 segment_length_a),
7144 addr_base_b),
7145 fold_build2 (LT_EXPR, boolean_type_node,
7146 fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b),
7147 addr_base_b,
7148 segment_length_b),
7149 addr_base_a));
7150
7151 if (*cond_expr)
7152 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
7153 *cond_expr, part_cond_expr);
7154 else
7155 *cond_expr = part_cond_expr;
7156 }
7157 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7158 fprintf (vect_dump, "created %u versioning for alias checks.\n",
7159 VEC_length (ddr_p, may_alias_ddrs));
7160
7161 }
7162
7163 /* Function vect_loop_versioning.
7164
7165    If the loop has data references that may or may not be aligned and/or
7166    has data reference relations whose independence was not proven, then
7167    two versions of the loop need to be generated, one which is vectorized
7168    and one which isn't.  A test is then generated to control which of the
7169    loops is executed.  The test checks the alignment of all of the
7170    data references that may or may not be aligned.  An additional
7171    sequence of runtime tests is generated for each pair of DDRs whose
7172    independence was not proven.  The vectorized version of the loop is
7173    executed only if both the alias and the alignment tests pass.
7174
7175    The test that controls which version of the loop is executed
7176    also includes a profitability check, using the threshold provided
7177    by the cost model.  */
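/* For illustration, the versioned code has roughly the following shape
   (the condition names are schematic):

     if (scalar_loop_iters > th && addresses_aligned && no_overlap)
       loop to be vectorized;     taken with probability 4/5
     else
       scalar copy of the loop;

   where th is the threshold returned by conservative_cost_threshold.  */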
7178
7179 static void
7180 vect_loop_versioning (loop_vec_info loop_vinfo)
7181 {
7182 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7183 struct loop *nloop;
7184 tree cond_expr = NULL_TREE;
7185 tree cond_expr_stmt_list = NULL_TREE;
7186 basic_block condition_bb;
7187 block_stmt_iterator cond_exp_bsi;
7188 basic_block merge_bb;
7189 basic_block new_exit_bb;
7190 edge new_exit_e, e;
7191 tree orig_phi, new_phi, arg;
7192 unsigned prob = 4 * REG_BR_PROB_BASE / 5;
7193 tree gimplify_stmt_list;
7194 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
7195 int min_profitable_iters = 0;
7196 unsigned int th;
7197
7198 /* Get profitability threshold for vectorized loop. */
7199 min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
7200
7201 th = conservative_cost_threshold (loop_vinfo,
7202 min_profitable_iters);
7203
7204 cond_expr =
7205 build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
7206 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
7207
7208 cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list,
7209 false, NULL_TREE);
7210
7211 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
7212 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
7213 &cond_expr_stmt_list);
7214
7215 if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7216 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr,
7217 &cond_expr_stmt_list);
7218
7219 cond_expr =
7220 fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node);
7221 cond_expr =
7222 force_gimple_operand (cond_expr, &gimplify_stmt_list, true,
7223 NULL_TREE);
7224 append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list);
7225
7226 initialize_original_copy_tables ();
7227 nloop = loop_version (loop, cond_expr, &condition_bb,
7228 prob, prob, REG_BR_PROB_BASE - prob, true);
7229 free_original_copy_tables();
7230
7231 /* Loop versioning violates an assumption we try to maintain during
7232 vectorization - that the loop exit block has a single predecessor.
7233 After versioning, the exit block of both loop versions is the same
7234    basic block (i.e. it has two predecessors).  In order to simplify the
7235    following transformations in the vectorizer, we fix this situation
7236 here by adding a new (empty) block on the exit-edge of the loop,
7237 with the proper loop-exit phis to maintain loop-closed-form. */
7238
7239 merge_bb = single_exit (loop)->dest;
7240 gcc_assert (EDGE_COUNT (merge_bb->preds) == 2);
7241 new_exit_bb = split_edge (single_exit (loop));
7242 new_exit_e = single_exit (loop);
7243 e = EDGE_SUCC (new_exit_bb, 0);
7244
7245 for (orig_phi = phi_nodes (merge_bb); orig_phi;
7246 orig_phi = PHI_CHAIN (orig_phi))
7247 {
7248 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
7249 new_exit_bb);
7250 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
7251 add_phi_arg (new_phi, arg, new_exit_e);
7252 SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi));
7253 }
7254
7255 /* End loop-exit-fixes after versioning. */
7256
7257 update_ssa (TODO_update_ssa);
7258 if (cond_expr_stmt_list)
7259 {
7260 cond_exp_bsi = bsi_last (condition_bb);
7261 bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT);
7262 }
7263 }
7264
7265 /* Remove a group of stores (for SLP or interleaving), free their
7266 stmt_vec_info. */
7267
7268 static void
7269 vect_remove_stores (tree first_stmt)
7270 {
7271 stmt_ann_t ann;
7272 tree next = first_stmt;
7273 tree tmp;
7274 stmt_vec_info next_stmt_info;
7275 block_stmt_iterator next_si;
7276
7277 while (next)
7278 {
7279 /* Free the attached stmt_vec_info and remove the stmt. */
7280 next_si = bsi_for_stmt (next);
7281 bsi_remove (&next_si, true);
7282 next_stmt_info = vinfo_for_stmt (next);
7283 ann = stmt_ann (next);
7284 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7285 free (next_stmt_info);
7286 set_stmt_info (ann, NULL);
7287 next = tmp;
7288 }
7289 }
7290
7291
7292 /* Vectorize SLP instance tree in postorder. */
7293
7294 static bool
7295 vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
7296 {
7297 tree stmt;
7298 bool strided_store, is_store;
7299 block_stmt_iterator si;
7300 stmt_vec_info stmt_info;
7301
7302 if (!node)
7303 return false;
7304
7305 vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
7306 vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
7307
7308 stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
7309 stmt_info = vinfo_for_stmt (stmt);
7310 SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
7311 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
7312
7313 if (vect_print_dump_info (REPORT_DETAILS))
7314 {
7315 fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
7316 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7317 }
7318
7319 si = bsi_for_stmt (stmt);
7320 is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
7321 if (is_store)
7322 {
7323 if (DR_GROUP_FIRST_DR (stmt_info))
7324 /* If IS_STORE is TRUE, the vectorization of the
7325 interleaving chain was completed - free all the stores in
7326 the chain. */
7327 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
7328 else
7329 /* FORNOW: SLP originates only from strided stores. */
7330 gcc_unreachable ();
7331
7332 return true;
7333 }
7334
7335 /* FORNOW: SLP originates only from strided stores. */
7336 return false;
7337 }
7338
7339
7340 static bool
7341 vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
7342 {
7343 VEC (slp_instance, heap) *slp_instances =
7344 LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7345 slp_instance instance;
7346 unsigned int vec_stmts_size;
7347 unsigned int group_size, i;
7348 unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7349 bool is_store = false;
7350
7351 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
7352 {
7353 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
7354       /* For each SLP instance calculate the number of vector stmts to be created
7355          for the scalar stmts in each node of the SLP tree.  The number of vector
7356          elements in one vector iteration is the number of scalar elements in
7357          one scalar iteration (GROUP_SIZE) multiplied by VF divided by the
7358          vector size.  */
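      /* For illustration (hypothetical values): GROUP_SIZE = 8, VF = 4 and
         nunits = 4 give vec_stmts_size = 4 * 8 / 4 = 8 vector stmts for
         each node of this instance.  */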
7359 vec_stmts_size = vectorization_factor * group_size / nunits;
7360
7361 /* Schedule the tree of INSTANCE. */
7362 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
7363 vec_stmts_size);
7364
7365 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
7366 || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
7367 fprintf (vect_dump, "vectorizing stmts using SLP.");
7368 }
7369
7370 return is_store;
7371 }
7372
7373 /* Function vect_transform_loop.
7374
7375 The analysis phase has determined that the loop is vectorizable.
7376    Vectorize the loop - create vectorized stmts to replace the scalar
7377 stmts in the loop, and update the loop exit condition. */
7378
7379 void
7380 vect_transform_loop (loop_vec_info loop_vinfo)
7381 {
7382 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7383 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7384 int nbbs = loop->num_nodes;
7385 block_stmt_iterator si, next_si;
7386 int i;
7387 tree ratio = NULL;
7388 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7389 bool strided_store;
7390 bool slp_scheduled = false;
7391 unsigned int nunits;
7392
7393 if (vect_print_dump_info (REPORT_DETAILS))
7394     fprintf (vect_dump, "=== vect_transform_loop ===");
7395
7396 if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
7397 || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
7398 vect_loop_versioning (loop_vinfo);
7399
7400 /* CHECKME: we wouldn't need this if we called update_ssa once
7401 for all loops. */
7402 bitmap_zero (vect_memsyms_to_rename);
7403
7404 /* Peel the loop if there are data refs with unknown alignment.
7405      Only one data ref with unknown alignment is handled this way.  */
7406
7407 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
7408 vect_do_peeling_for_alignment (loop_vinfo);
7409
7410 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
7411      compile time constant), or it is a constant that is not divisible by the
7412 vectorization factor, then an epilog loop needs to be created.
7413 We therefore duplicate the loop: the original loop will be vectorized,
7414 and will compute the first (n/VF) iterations. The second copy of the loop
7415 will remain scalar and will compute the remaining (n%VF) iterations.
7416 (VF is the vectorization factor). */
7417
7418 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7419 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7420 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0))
7421 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio);
7422 else
7423 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7424 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
7425
7426 /* 1) Make sure the loop header has exactly two entries
7427 2) Make sure we have a preheader basic block. */
7428
7429 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7430
7431 split_edge (loop_preheader_edge (loop));
7432
7433   /* FORNOW: the vectorizer supports only loops whose body consists
7434      of one basic block (header + empty latch).  When the vectorizer
7435      supports more involved loop forms, the order in which the BBs are
7436      traversed will need to be reconsidered.  */
7437
7438 for (i = 0; i < nbbs; i++)
7439 {
7440 basic_block bb = bbs[i];
7441 stmt_vec_info stmt_info;
7442 tree phi;
7443
7444 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
7445 {
7446 if (vect_print_dump_info (REPORT_DETAILS))
7447 {
7448 fprintf (vect_dump, "------>vectorizing phi: ");
7449 print_generic_expr (vect_dump, phi, TDF_SLIM);
7450 }
7451 stmt_info = vinfo_for_stmt (phi);
7452 if (!stmt_info)
7453 continue;
7454
7455 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7456 && !STMT_VINFO_LIVE_P (stmt_info))
7457 continue;
7458
7459 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7460 != (unsigned HOST_WIDE_INT) vectorization_factor)
7461 && vect_print_dump_info (REPORT_DETAILS))
7462 fprintf (vect_dump, "multiple-types.");
7463
7464 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
7465 {
7466 if (vect_print_dump_info (REPORT_DETAILS))
7467 fprintf (vect_dump, "transform phi.");
7468 vect_transform_stmt (phi, NULL, NULL, NULL);
7469 }
7470 }
7471
7472 for (si = bsi_start (bb); !bsi_end_p (si);)
7473 {
7474 tree stmt = bsi_stmt (si);
7475 bool is_store;
7476
7477 if (vect_print_dump_info (REPORT_DETAILS))
7478 {
7479 fprintf (vect_dump, "------>vectorizing statement: ");
7480 print_generic_expr (vect_dump, stmt, TDF_SLIM);
7481 }
7482
7483 stmt_info = vinfo_for_stmt (stmt);
7484
7485 /* vector stmts created in the outer-loop during vectorization of
7486 stmts in an inner-loop may not have a stmt_info, and do not
7487 need to be vectorized. */
7488 if (!stmt_info)
7489 {
7490 bsi_next (&si);
7491 continue;
7492 }
7493
7494 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7495 && !STMT_VINFO_LIVE_P (stmt_info))
7496 {
7497 bsi_next (&si);
7498 continue;
7499 }
7500
7501 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
7502 nunits =
7503 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7504 if (!STMT_SLP_TYPE (stmt_info)
7505 && nunits != (unsigned int) vectorization_factor
7506 && vect_print_dump_info (REPORT_DETAILS))
7507	    /* For SLP, VF is set according to the unrolling factor, and not to
7508	       the vector size, hence for SLP this print is not valid.  */
7509 fprintf (vect_dump, "multiple-types.");
7510
7511 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7512 reached. */
7513 if (STMT_SLP_TYPE (stmt_info))
7514 {
7515 if (!slp_scheduled)
7516 {
7517 slp_scheduled = true;
7518
7519 if (vect_print_dump_info (REPORT_DETAILS))
7520 fprintf (vect_dump, "=== scheduling SLP instances ===");
7521
7522 is_store = vect_schedule_slp (loop_vinfo, nunits);
7523
7524 /* IS_STORE is true if STMT is a store. Stores cannot be of
7525 hybrid SLP type. They are removed in
7526 vect_schedule_slp_instance and their vinfo is destroyed. */
7527 if (is_store)
7528 {
7529 bsi_next (&si);
7530 continue;
7531 }
7532 }
7533
7534 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7535 if (PURE_SLP_STMT (stmt_info))
7536 {
7537 bsi_next (&si);
7538 continue;
7539 }
7540 }
7541
7542 /* -------- vectorize statement ------------ */
7543 if (vect_print_dump_info (REPORT_DETAILS))
7544 fprintf (vect_dump, "transform statement.");
7545
7546 strided_store = false;
7547 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
7548 if (is_store)
7549 {
7550 stmt_ann_t ann;
7551 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
7552 {
7553 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7554 interleaving chain was completed - free all the stores in
7555 the chain. */
7556 tree next = DR_GROUP_FIRST_DR (stmt_info);
7557 tree tmp;
7558 stmt_vec_info next_stmt_info;
7559
7560 while (next)
7561 {
7562 next_si = bsi_for_stmt (next);
7563 next_stmt_info = vinfo_for_stmt (next);
7564 /* Free the attached stmt_vec_info and remove the stmt. */
7565 ann = stmt_ann (next);
7566 tmp = DR_GROUP_NEXT_DR (next_stmt_info);
7567 free (next_stmt_info);
7568 set_stmt_info (ann, NULL);
7569 bsi_remove (&next_si, true);
7570 next = tmp;
7571 }
7572 bsi_remove (&si, true);
7573 continue;
7574 }
7575 else
7576 {
7577 /* Free the attached stmt_vec_info and remove the stmt. */
7578 ann = stmt_ann (stmt);
7579 free (stmt_info);
7580 set_stmt_info (ann, NULL);
7581 bsi_remove (&si, true);
7582 continue;
7583 }
7584 }
7585 bsi_next (&si);
7586 } /* stmts in BB */
7587 } /* BBs in loop */
7588
7589 slpeel_make_loop_iterate_ntimes (loop, ratio);
7590
7591 mark_set_for_renaming (vect_memsyms_to_rename);
7592
7593 /* The memory tags and pointers in vectorized statements need to
7594 have their SSA forms updated. FIXME, why can't this be delayed
7595 until all the loops have been transformed? */
7596 update_ssa (TODO_update_ssa);
7597
7598 if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7599 fprintf (vect_dump, "LOOP VECTORIZED.");
7600 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS))
7601 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
7602 }