tree-vect-slp.c (vect_build_slp_tree_1): Remove bailout on gaps.
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "rtl.h"
#include "flags.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the location of the first statement that has one, and
   UNKNOWN_LOCATION otherwise.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;

  return node;
}

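/* An illustrative note (not part of the original source): for a group
   whose first stmt is the assignment "x_1 = a_2 + b_3", NOPS is
   gimple_num_ops - 1 = 2.  For "x_1 = c_4 < d_5 ? a_2 : b_3" (a
   COND_EXPR on the rhs) the extra increment yields NOPS = 4: the two
   operands of the embedded comparison plus the two arms, matching how
   vect_get_and_check_slp_defs walks the operands.  */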

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}

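/* A worked example (illustrative, not from the original source): in an
   interleaving chain built from loads of a[0], a[2] and a[3], GROUP_GAP
   of a[2] is 2 and GROUP_GAP of a[3] is 1, each gap being measured from
   the previous chain element.  The loop above therefore computes place
   0 for a[0], 0 + 2 = 2 for a[2] and 0 + 2 + 1 = 3 for a[3], i.e.
   skipped elements count toward the position.  */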

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
   return -1, if the error could be corrected by swapping operands of the
   operation return 1, if everything is ok return 0.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}

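/* An illustrative example (not part of the original source) of the
   operand swapping above: for the commutative group

     x_1 = a_1 + b_1;
     x_2 = b_2 + a_2;

   the second stmt's operand 0 def may fail to match the definition
   kind or type recorded from the first stmt; retrying with SWAPPED set
   reads that stmt's operands in reverse order so { a_1, a_2 } and
   { b_1, b_2 } line up, and the final swap_ssa_operands call makes the
   GIMPLE IL reflect the canonical order.  */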

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are isomorphic, otherwise return false and indicate
   in *MATCHES which stmts are not isomorphic to the first one.  If
   MATCHES[0] is false then this indicates that the comparison could
   not be carried out or that the stmts will never be vectorized
   by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  gimple first_stmt = stmts[0], stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* If populating the vector type requires unrolling then fail
         before adjusting *max_nunits for basic-block vectorization.  */
      if (bb_vinfo
          && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unrolling required "
                           "in basic block SLP\n");
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not a grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
        = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
        {
          sel[i] = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
        }
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

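/* An illustrative example (not from the original source) of the
   two-operator case accepted above: the group

     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
     a[2] = b[2] + c[2];
     a[3] = b[3] - c[3];

   mixes PLUS_EXPR and MINUS_EXPR.  It can be vectorized by computing
   both b + c and b - c as full vectors and blending them with the
   permute checked via can_vec_perm_p; with four lanes the selector is
   { 0, 5, 2, 7 }, taking even lanes from the add and odd lanes from
   the sub.  */
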
/* Recursively build an SLP tree starting from NODE.
   Fail (and return false) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise,
   return true; which stmts failed to match, if any, is recorded
   in MATCHES.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  bool two_operators = false;
  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches,
                              &two_operators))
    return false;
  SLP_TREE_TWO_OPERATORS (*node) = two_operators;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, i, &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ())
            {
              unsigned int j;
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (grandchild != NULL)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  *max_nunits = old_max_nunits;
                  loads->truncate (old_nloads);
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  vect_free_slp_tree (child);
                  SLP_TREE_CHILDREN (*node).quick_push (NULL);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (bb_vinfo
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          SLP_TREE_CHILDREN (*node).quick_push (NULL);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && !SLP_TREE_TWO_OPERATORS (*node)
          /* Do so only if the number of unsuccessful permutes was no
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   tem, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  */
              for (j = 0; j < group_size; ++j)
                if (!matches[j])
                  {
                    gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                  }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

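/* An illustrative sketch (not from the original source) of the tree
   this builds for the two-element store group

     t0 = b[0] + c[0];  a[0] = t0;
     t1 = b[1] + c[1];  a[1] = t1;

   the root holds the stores and the recursion adds one child per
   internal operand, terminating at loads:

     node { a[0] = t0, a[1] = t1 }
       child { t0 = b[0] + c[0], t1 = b[1] + c[1] }
         child 0: load node { b[0], b[1] }
         child 1: load node { c[0], c[1] }

   Constant and external operands get no child node here; their vectors
   are built in the prologue.  A NULL child is pushed only by the
   build-from-scalars fallback above.  */
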
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

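/* An illustrative note (not from the original source): with
   PERMUTATION = { 2, 0, 1 } the assignment above stores stmt 0 at
   index 2, stmt 1 at index 0 and stmt 2 at index 1, turning
   { s0, s1, s2 } into { s1, s2, s0 }; that is, PERMUTATION[i] gives
   the new position of stmt i.  */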

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ??? This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}

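/* An illustrative example (not from the original source) of the
   basic-block subchain rule above: given an interleaving chain
   a[0], a[1], a[2], a[3], an SLP node loading { a[1], a[2] } is a
   subchain (consecutive elements with GROUP_GAP 1), so its load
   permutation is dropped, while a node loading { a[1], a[0] } keeps
   the permutation { 1, 0 } and must pass
   vect_transform_slp_perm_load.  */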

/* Find the last scalar stmt in the SLP group rooted at NODE; for the
   root of an SLP instance built from grouped stores this is the last
   store of the instance.  */

static gimple
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple last = NULL, stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost)
{
  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ??? Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
        {
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                            stmt_info, 0, vect_body);
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
                            stmt_info, 0, vect_body);
        }
    }

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, STMT_VINFO_LOOP_VINFO (stmt_info),
                              STMT_VINFO_BB_VINFO (stmt_info),
                              &def_stmt, &def, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ??? We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Adjust the group_size by the vectorization factor which is always one
     for basic-block vectorization.  */
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
        {
        case DOT_PROD_EXPR:
        case SAD_EXPR:
          nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
                                (TREE_TYPE (gimple_assign_rhs1 (stmt))));
          break;
        default:;
        }
    }
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, &body_cost_vec,
                           ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

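/* A worked example (illustrative, not from the original source): for a
   loop-SLP instance with GROUP_SIZE 2, a vectorization factor of 4 and
   NUNITS 4, the adjusted group size is 2 * 4 = 8 and ncopies_for_cost
   = least_common_multiple (4, 8) / 4 = 2, so two vector stmts are
   costed per node; for basic-block SLP the group size is used
   unadjusted.  */
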
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
      /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, matches, &npermutes, NULL,
                           max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_LOADS (new_instance) = loads;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted
              /* The load requires permutation when unrolling exposes
                 a gap either because the group is larger than the SLP
                 group-size or because there is a gap between the groups.  */
              && (unrolling_factor == 1
                  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
                      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }
        }


      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}

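/* A worked example (illustrative, not from the original source): a
   store group of GROUP_SIZE 2 vectorized with NUNITS 4 gives
   unrolling_factor = least_common_multiple (4, 2) / 2 = 2, i.e. two
   loop iterations are needed to fill one vector; in basic-block SLP
   there is no loop to unroll, hence the bail-out above whenever the
   factor is not 1.  */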
1874
1875 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
1876 trees of packed scalar stmts if SLP is possible. */
1877
1878 bool
1879 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1880 unsigned max_tree_size)
1881 {
1882 unsigned int i;
1883 vec<gimple> grouped_stores;
1884 vec<gimple> reductions = vNULL;
1885 vec<gimple> reduc_chains = vNULL;
1886 gimple first_element;
1887 bool ok = false;
1888
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1891
1892 if (loop_vinfo)
1893 {
1894 grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1895 reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1896 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1897 }
1898 else
1899 grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1900
1901 /* Find SLP sequences starting from groups of grouped stores. */
1902 FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1903 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1904 max_tree_size))
1905 ok = true;
1906
1907 if (reduc_chains.length () > 0)
1908 {
1909 /* Find SLP sequences starting from reduction chains. */
1910 FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1911 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1912 max_tree_size))
1913 ok = true;
1914 else
1915 return false;
1916
      /* Don't try to vectorize SLP reductions if a reduction chain was
	 detected.  */
1919 return ok;
1920 }
1921
1922 /* Find SLP sequences starting from groups of reductions. */
1923 if (reductions.length () > 1
1924 && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1925 max_tree_size))
1926 ok = true;
1927
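  /* Finding no SLP opportunity is not a failure here: loop vectorization
     can still proceed without SLP, so return TRUE regardless of OK.  */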
1928 return true;
1929 }
1930
1931
/* For each possible SLP instance decide whether to SLP it and calculate
   the overall unrolling factor needed to SLP the loop.  Return TRUE if we
   decided to SLP at least one instance.  */
1935
1936 bool
1937 vect_make_slp_decision (loop_vec_info loop_vinfo)
1938 {
1939 unsigned int i, unrolling_factor = 1;
1940 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1941 slp_instance instance;
1942 int decided_to_slp = 0;
1943
1944 if (dump_enabled_p ())
1945 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1946 "\n");
1947
1948 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1949 {
1950 /* FORNOW: SLP if you can. */
1951 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1952 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1953
1954 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1955 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1956 loop-based vectorization. Such stmts will be marked as HYBRID. */
1957 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1958 decided_to_slp++;
1959 }
1960
1961 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1962
1963 if (decided_to_slp && dump_enabled_p ())
1964 dump_printf_loc (MSG_NOTE, vect_location,
1965 "Decided to SLP %d instances. Unrolling factor %d\n",
1966 decided_to_slp, unrolling_factor);
1967
1968 return (decided_to_slp > 0);
1969 }
1970
1971
1972 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1973 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
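/* For example, a PURE_SLP stmt whose def is also used by a stmt that is
   only loop-vectorized must be generated by both the SLP and the
   loop-based code, so it is marked HYBRID.  */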
1974
1975 static void
1976 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1977 {
1978 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1979 imm_use_iterator imm_iter;
1980 gimple use_stmt;
1981 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1982 slp_tree child;
1983 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1985 int j;
1986
1987 /* Propagate hybrid down the SLP tree. */
1988 if (stype == hybrid)
1989 ;
1990 else if (HYBRID_SLP_STMT (stmt_vinfo))
1991 stype = hybrid;
1992 else
1993 {
1994 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
1995 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1996 /* We always get the pattern stmt here, but for immediate
1997 uses we have to use the LHS of the original stmt. */
1998 gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1999 if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
2000 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2001 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
2002 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
2003 {
2004 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2005 continue;
2006 use_vinfo = vinfo_for_stmt (use_stmt);
2007 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2008 && STMT_VINFO_RELATED_STMT (use_vinfo))
2009 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2010 if (!STMT_SLP_TYPE (use_vinfo)
2011 && (STMT_VINFO_RELEVANT (use_vinfo)
2012 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2013 && !(gimple_code (use_stmt) == GIMPLE_PHI
2014 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2015 stype = hybrid;
2016 }
2017 }
2018
2019 if (stype == hybrid)
2020 {
2021 if (dump_enabled_p ())
2022 {
2023 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2024 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2025 }
2026 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2027 }
2028
2029 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2030 if (child)
2031 vect_detect_hybrid_slp_stmts (child, i, stype);
2032 }
2033
2034 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2035
2036 static tree
2037 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2038 {
2039 walk_stmt_info *wi = (walk_stmt_info *)data;
2040 struct loop *loopp = (struct loop *)wi->info;
2041
2042 if (wi->is_lhs)
2043 return NULL_TREE;
2044
2045 if (TREE_CODE (*tp) == SSA_NAME
2046 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2047 {
2048 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
2049 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2050 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2051 {
2052 if (dump_enabled_p ())
2053 {
2054 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2055 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2056 }
2057 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2058 }
2059 }
2060
2061 return NULL_TREE;
2062 }
2063
2064 static tree
2065 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2066 walk_stmt_info *)
2067 {
2068 /* If the stmt is in a SLP instance then this isn't a reason
2069 to mark use definitions in other SLP instances as hybrid. */
2070 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2071 *handled = true;
2072 return NULL_TREE;
2073 }
2074
2075 /* Find stmts that must be both vectorized and SLPed. */
2076
2077 void
2078 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2079 {
2080 unsigned int i;
2081 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2082 slp_instance instance;
2083
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2086 "\n");
2087
  /* First walk all pattern stmts in the loop and mark the defs of their
     uses as hybrid, because immediate uses inside pattern stmts are not
     recorded.  */
2090 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2091 {
2092 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2093 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2094 gsi_next (&gsi))
2095 {
2096 gimple stmt = gsi_stmt (gsi);
2097 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2098 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2099 {
2100 walk_stmt_info wi;
2101 memset (&wi, 0, sizeof (wi));
2102 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2103 gimple_stmt_iterator gsi2
2104 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2105 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2106 vect_detect_hybrid_slp_1, &wi);
2107 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2108 vect_detect_hybrid_slp_2,
2109 vect_detect_hybrid_slp_1, &wi);
2110 }
2111 }
2112 }
2113
2114 /* Then walk the SLP instance trees marking stmts with uses in
2115 non-SLP stmts as hybrid, also propagating hybrid down the
2116 SLP tree, collecting the above info on-the-fly. */
2117 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2118 {
2119 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2120 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2121 i, pure_slp);
2122 }
2123 }
2124
2125
2126 /* Create and initialize a new bb_vec_info struct for BB, as well as
2127 stmt_vec_info structs for all the stmts in it. */
2128
2129 static bb_vec_info
2130 new_bb_vec_info (basic_block bb)
2131 {
2132 bb_vec_info res = NULL;
2133 gimple_stmt_iterator gsi;
2134
2135 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2136 BB_VINFO_BB (res) = bb;
2137
2138 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2139 {
2140 gimple stmt = gsi_stmt (gsi);
2141 gimple_set_uid (stmt, 0);
2142 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2143 }
2144
2145 BB_VINFO_GROUPED_STORES (res).create (10);
2146 BB_VINFO_SLP_INSTANCES (res).create (2);
2147 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2148
2149 bb->aux = res;
2150 return res;
2151 }
2152
2153
2154 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2155 stmts in the basic block. */
2156
2157 static void
2158 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2159 {
2160 vec<slp_instance> slp_instances;
2161 slp_instance instance;
2162 basic_block bb;
2163 gimple_stmt_iterator si;
2164 unsigned i;
2165
2166 if (!bb_vinfo)
2167 return;
2168
2169 bb = BB_VINFO_BB (bb_vinfo);
2170
2171 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2172 {
2173 gimple stmt = gsi_stmt (si);
2174 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2175
2176 if (stmt_info)
2177 /* Free stmt_vec_info. */
2178 free_stmt_vec_info (stmt);
2179 }
2180
2181 vect_destroy_datarefs (NULL, bb_vinfo);
2182 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2183 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2184 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2185 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2186 vect_free_slp_instance (instance);
2187 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2188 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2189 free (bb_vinfo);
2190 bb->aux = NULL;
2191 }
2192
2193
2194 /* Analyze statements contained in SLP tree node after recursively analyzing
2195 the subtree. Return TRUE if the operations are supported. */
2196
2197 static bool
2198 vect_slp_analyze_node_operations (slp_tree node)
2199 {
2200 bool dummy;
2201 int i;
2202 gimple stmt;
2203 slp_tree child;
2204
2205 if (!node)
2206 return true;
2207
2208 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2209 if (!vect_slp_analyze_node_operations (child))
2210 return false;
2211
2212 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2213 {
2214 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2215 gcc_assert (stmt_info);
2216 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2217
2218 if (!vect_analyze_stmt (stmt, &dummy, node))
2219 return false;
2220 }
2221
2222 return true;
2223 }
2224
2225
2226 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2227 operations are supported. */
2228
2229 bool
2230 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2231 {
2232 slp_instance instance;
2233 int i;
2234
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE, vect_location,
2237 "=== vect_slp_analyze_operations ===\n");
2238
2239 for (i = 0; slp_instances.iterate (i, &instance); )
2240 {
2241 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
        {
          /* Guard the dumps like the other dump calls in this file.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "removing SLP instance operations starting from: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                SLP_TREE_SCALAR_STMTS
                                  (SLP_INSTANCE_TREE (instance))[0], 0);
            }
          vect_free_slp_instance (instance);
          slp_instances.ordered_remove (i);
        }
2251 else
2252 {
2253 /* Compute the costs of the SLP instance. */
2254 vect_analyze_slp_cost (instance, data);
2255 i++;
2256 }
2257 }
2258
2259 if (!slp_instances.length ())
2260 return false;
2261
2262 return true;
2263 }
2264
2265
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account for defs that are marked in LIFE,
   and update LIFE according to the uses of NODE.  */
2269
2270 static unsigned
2271 vect_bb_slp_scalar_cost (basic_block bb,
2272 slp_tree node, vec<bool, va_heap> *life)
2273 {
2274 unsigned scalar_cost = 0;
2275 unsigned i;
2276 gimple stmt;
2277 slp_tree child;
2278
2279 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2280 {
2281 unsigned stmt_cost;
2282 ssa_op_iter op_iter;
2283 def_operand_p def_p;
2284 stmt_vec_info stmt_info;
2285
2286 if ((*life)[i])
2287 continue;
2288
      /* If any def has a use that is not vectorized, the scalar stmt is
         kept live, in which case neither it nor any defs it requires from
         the SLP children are accounted in the scalar cost.  This makes
         the vectorization appear more costly when compared to the scalar
         code.  */
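      /* For instance, a def that is also used in another basic block keeps
         its scalar stmt live, so its cost must not be credited to
         vectorization.  */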
2294 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2295 {
2296 imm_use_iterator use_iter;
2297 gimple use_stmt;
2298 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2299 if (!is_gimple_debug (use_stmt)
2300 && (gimple_code (use_stmt) == GIMPLE_PHI
2301 || gimple_bb (use_stmt) != bb
2302 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2303 {
2304 (*life)[i] = true;
2305 BREAK_FROM_IMM_USE_STMT (use_iter);
2306 }
2307 }
2308 if ((*life)[i])
2309 continue;
2310
2311 stmt_info = vinfo_for_stmt (stmt);
2312 if (STMT_VINFO_DATA_REF (stmt_info))
2313 {
2314 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2315 stmt_cost = vect_get_stmt_cost (scalar_load);
2316 else
2317 stmt_cost = vect_get_stmt_cost (scalar_store);
2318 }
2319 else
2320 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2321
2322 scalar_cost += stmt_cost;
2323 }
2324
2325 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2326 if (child)
2327 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2328
2329 return scalar_cost;
2330 }
2331
2332 /* Check if vectorization of the basic block is profitable. */
2333
2334 static bool
2335 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2336 {
2337 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2338 slp_instance instance;
2339 int i;
2340 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2341 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2342
2343 /* Calculate scalar cost. */
2344 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2345 {
2346 auto_vec<bool, 20> life;
2347 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2348 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2349 SLP_INSTANCE_TREE (instance),
2350 &life);
2351 }
2352
2353 /* Complete the target-specific cost calculation. */
2354 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2355 &vec_inside_cost, &vec_epilogue_cost);
2356
2357 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2358
2359 if (dump_enabled_p ())
2360 {
2361 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2362 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2363 vec_inside_cost);
2364 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2365 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2366 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2367 }
2368
  /* Vectorization is profitable only if its cost is strictly less than the
     cost of the scalar version.  */
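  /* E.g., vec_outside_cost 2 plus vec_inside_cost 8 against scalar_cost 10
     is rejected: a tie keeps the scalar code.  */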
2371 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2372 return false;
2373
2374 return true;
2375 }
2376
2377 /* Check if the basic block can be vectorized. */
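/* For instance, a block containing

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];

   may be rewritten as a single vector addition and vector store if the
   target supports it.  */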
2378
2379 static bb_vec_info
2380 vect_slp_analyze_bb_1 (basic_block bb)
2381 {
2382 bb_vec_info bb_vinfo;
2383 vec<slp_instance> slp_instances;
2384 slp_instance instance;
2385 int i;
2386 int min_vf = 2;
2387 unsigned n_stmts = 0;
2388
2389 bb_vinfo = new_bb_vec_info (bb);
2390 if (!bb_vinfo)
2391 return NULL;
2392
2393 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2394 {
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397 "not vectorized: unhandled data-ref in basic "
2398 "block.\n");
2399
2400 destroy_bb_vec_info (bb_vinfo);
2401 return NULL;
2402 }
2403
2404 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2405 {
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "not vectorized: not enough data-refs in "
2409 "basic block.\n");
2410
2411 destroy_bb_vec_info (bb_vinfo);
2412 return NULL;
2413 }
2414
2415 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2416 {
2417 if (dump_enabled_p ())
2418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2419 "not vectorized: unhandled data access in "
2420 "basic block.\n");
2421
2422 destroy_bb_vec_info (bb_vinfo);
2423 return NULL;
2424 }
2425
2426 vect_pattern_recog (NULL, bb_vinfo);
2427
2428 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2429 {
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "not vectorized: bad data alignment in basic "
2433 "block.\n");
2434
2435 destroy_bb_vec_info (bb_vinfo);
2436 return NULL;
2437 }
2438
2439 /* Check the SLP opportunities in the basic block, analyze and build SLP
2440 trees. */
2441 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2442 {
2443 if (dump_enabled_p ())
2444 {
2445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2446 "Failed to SLP the basic block.\n");
2447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448 "not vectorized: failed to find SLP opportunities "
2449 "in basic block.\n");
2450 }
2451
2452 destroy_bb_vec_info (bb_vinfo);
2453 return NULL;
2454 }
2455
2456 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2457
2458 /* Mark all the statements that we want to vectorize as pure SLP and
2459 relevant. */
2460 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2461 {
2462 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2463 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2464 }
2465
2466 /* Mark all the statements that we do not want to vectorize. */
2467 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2468 !gsi_end_p (gsi); gsi_next (&gsi))
2469 {
2470 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2471 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2472 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2473 }
2474
2475 /* Analyze dependences. At this point all stmts not participating in
2476 vectorization have to be marked. Dependence analysis assumes
2477 that we either vectorize all SLP instances or none at all. */
2478 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2479 {
2480 if (dump_enabled_p ())
2481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2482 "not vectorized: unhandled data dependence "
2483 "in basic block.\n");
2484
2485 destroy_bb_vec_info (bb_vinfo);
2486 return NULL;
2487 }
2488
2489 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2490 {
2491 if (dump_enabled_p ())
2492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493 "not vectorized: unsupported alignment in basic "
2494 "block.\n");
2495 destroy_bb_vec_info (bb_vinfo);
2496 return NULL;
2497 }
2498
2499 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2500 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2501 {
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504 "not vectorized: bad operation in basic block.\n");
2505
2506 destroy_bb_vec_info (bb_vinfo);
2507 return NULL;
2508 }
2509
2510 /* Cost model: check if the vectorization is worthwhile. */
2511 if (!unlimited_cost_model (NULL)
2512 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2513 {
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2516 "not vectorized: vectorization is not "
2517 "profitable.\n");
2518
2519 destroy_bb_vec_info (bb_vinfo);
2520 return NULL;
2521 }
2522
2523 if (dump_enabled_p ())
2524 dump_printf_loc (MSG_NOTE, vect_location,
2525 "Basic block will be vectorized using SLP\n");
2526
2527 return bb_vinfo;
2528 }
2529
2530
2531 bb_vec_info
2532 vect_slp_analyze_bb (basic_block bb)
2533 {
2534 bb_vec_info bb_vinfo;
2535 int insns = 0;
2536 gimple_stmt_iterator gsi;
2537 unsigned int vector_sizes;
2538
2539 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2541
2542 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2543 {
2544 gimple stmt = gsi_stmt (gsi);
2545 if (!is_gimple_debug (stmt)
2546 && !gimple_nop_p (stmt)
2547 && gimple_code (stmt) != GIMPLE_LABEL)
2548 insns++;
2549 }
2550
2551 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2552 {
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "not vectorized: too many instructions in "
2556 "basic block.\n");
2557
2558 return NULL;
2559 }
2560
2561 /* Autodetect first vector size we try. */
2562 current_vector_size = 0;
2563 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2564
2565 while (1)
2566 {
2567 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2568 if (bb_vinfo)
2569 return bb_vinfo;
2570
2571 destroy_bb_vec_info (bb_vinfo);
2572
2573 vector_sizes &= ~current_vector_size;
2574 if (vector_sizes == 0
2575 || current_vector_size == 0)
2576 return NULL;
2577
2578 /* Try the next biggest vector size. */
2579 current_vector_size = 1 << floor_log2 (vector_sizes);
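      /* E.g., after failing with an autodetected size of 16, 16 has been
         cleared from VECTOR_SIZES above and the largest remaining size,
         say 8, is tried next.  */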
2580 if (dump_enabled_p ())
2581 dump_printf_loc (MSG_NOTE, vect_location,
2582 "***** Re-trying analysis with "
2583 "vector size %d\n", current_vector_size);
2584 }
2585 }
2586
2587
2588 /* For constant and loop invariant defs of SLP_NODE this function returns
2589 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2590 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2591 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
   REDUC_INDEX is the index of the reduction operand in the statements,
   or -1 if the node is not part of a reduction.  */
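/* For example, for the group { x_1 = a_1 + 1; x_2 = a_2 + 2; } with
   OP_NUM 1, NUNITS 4 and a single vector to create, this builds the
   invariant vector { 1, 2, 1, 2 }.  */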
2594
2595 static void
2596 vect_get_constant_vectors (tree op, slp_tree slp_node,
2597 vec<tree> *vec_oprnds,
2598 unsigned int op_num, unsigned int number_of_vectors,
2599 int reduc_index)
2600 {
2601 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2602 gimple stmt = stmts[0];
2603 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2604 unsigned nunits;
2605 tree vec_cst;
2606 tree *elts;
2607 unsigned j, number_of_places_left_in_vector;
2608 tree vector_type;
2609 tree vop;
2610 int group_size = stmts.length ();
2611 unsigned int vec_num, i;
2612 unsigned number_of_copies = 1;
2613 vec<tree> voprnds;
2614 voprnds.create (number_of_vectors);
2615 bool constant_p, is_store;
2616 tree neutral_op = NULL;
2617 enum tree_code code = gimple_expr_code (stmt);
2618 gimple def_stmt;
2619 struct loop *loop;
2620 gimple_seq ctor_seq = NULL;
2621
2622 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2623 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2624
2625 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2626 && reduc_index != -1)
2627 {
2628 op_num = reduc_index;
2629 op = gimple_op (stmt, op_num + 1);
2630 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2631 we need either neutral operands or the original operands. See
2632 get_initial_def_for_reduction() for details. */
2633 switch (code)
2634 {
2635 case WIDEN_SUM_EXPR:
2636 case DOT_PROD_EXPR:
2637 case SAD_EXPR:
2638 case PLUS_EXPR:
2639 case MINUS_EXPR:
2640 case BIT_IOR_EXPR:
2641 case BIT_XOR_EXPR:
2642 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2643 neutral_op = build_real (TREE_TYPE (op), dconst0);
2644 else
2645 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2646
2647 break;
2648
2649 case MULT_EXPR:
2650 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2651 neutral_op = build_real (TREE_TYPE (op), dconst1);
2652 else
2653 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2654
2655 break;
2656
2657 case BIT_AND_EXPR:
2658 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2659 break;
2660
2661 /* For MIN/MAX we don't have an easy neutral operand but
2662 the initial values can be used fine here. Only for
2663 a reduction chain we have to force a neutral element. */
2664 case MAX_EXPR:
2665 case MIN_EXPR:
2666 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2667 neutral_op = NULL;
2668 else
2669 {
2670 def_stmt = SSA_NAME_DEF_STMT (op);
2671 loop = (gimple_bb (stmt))->loop_father;
2672 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2673 loop_preheader_edge (loop));
2674 }
2675 break;
2676
2677 default:
2678 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2679 neutral_op = NULL;
2680 }
2681 }
2682
2683 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2684 {
2685 is_store = true;
2686 op = gimple_assign_rhs1 (stmt);
2687 }
2688 else
2689 is_store = false;
2690
2691 gcc_assert (op);
2692
2693 if (CONSTANT_CLASS_P (op))
2694 constant_p = true;
2695 else
2696 constant_p = false;
2697
2698 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2699 created vectors. It is greater than 1 if unrolling is performed.
2700
2701 For example, we have two scalar operands, s1 and s2 (e.g., group of
2702 strided accesses of size two), while NUNITS is four (i.e., four scalars
2703 of this type can be packed in a vector). The output vector will contain
2704 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2705 will be 2).
2706
2707 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2708 containing the operands.
2709
2710 For example, NUNITS is four as before, and the group size is 8
2711 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2712 {s5, s6, s7, s8}. */
2713
2714 number_of_copies = nunits * number_of_vectors / group_size;
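  /* In the first example above this is 4 * 1 / 2 == 2; in the second,
     4 * 2 / 8 == 1.  */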
2715
2716 number_of_places_left_in_vector = nunits;
2717 elts = XALLOCAVEC (tree, nunits);
2718 bool place_after_defs = false;
2719 for (j = 0; j < number_of_copies; j++)
2720 {
2721 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2722 {
2723 if (is_store)
2724 op = gimple_assign_rhs1 (stmt);
2725 else
2726 {
2727 switch (code)
2728 {
2729 case COND_EXPR:
2730 if (op_num == 0 || op_num == 1)
2731 {
2732 tree cond = gimple_assign_rhs1 (stmt);
2733 op = TREE_OPERAND (cond, op_num);
2734 }
2735 else
2736 {
2737 if (op_num == 2)
2738 op = gimple_assign_rhs2 (stmt);
2739 else
2740 op = gimple_assign_rhs3 (stmt);
2741 }
2742 break;
2743
2744 case CALL_EXPR:
2745 op = gimple_call_arg (stmt, op_num);
2746 break;
2747
2748 case LSHIFT_EXPR:
2749 case RSHIFT_EXPR:
2750 case LROTATE_EXPR:
2751 case RROTATE_EXPR:
2752 op = gimple_op (stmt, op_num + 1);
2753 /* Unlike the other binary operators, shifts/rotates have
2754 the shift count being int, instead of the same type as
2755 the lhs, so make sure the scalar is the right type if
2756 we are dealing with vectors of
2757 long long/long/short/char. */
2758 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2759 op = fold_convert (TREE_TYPE (vector_type), op);
2760 break;
2761
2762 default:
2763 op = gimple_op (stmt, op_num + 1);
2764 break;
2765 }
2766 }
2767
2768 if (reduc_index != -1)
2769 {
2770 loop = (gimple_bb (stmt))->loop_father;
2771 def_stmt = SSA_NAME_DEF_STMT (op);
2772
2773 gcc_assert (loop);
2774
              /* Get the def before the loop.  In a reduction chain we have
                 only one initial value.  */
2777 if ((j != (number_of_copies - 1)
2778 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2779 && i != 0))
2780 && neutral_op)
2781 op = neutral_op;
2782 else
2783 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2784 loop_preheader_edge (loop));
2785 }
2786
2787 /* Create 'vect_ = {op0,op1,...,opn}'. */
2788 number_of_places_left_in_vector--;
2789 tree orig_op = op;
2790 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2791 {
2792 if (CONSTANT_CLASS_P (op))
2793 {
2794 op = fold_unary (VIEW_CONVERT_EXPR,
2795 TREE_TYPE (vector_type), op);
2796 gcc_assert (op && CONSTANT_CLASS_P (op));
2797 }
2798 else
2799 {
2800 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2801 gimple init_stmt;
2802 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2803 init_stmt
2804 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2805 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2806 op = new_temp;
2807 }
2808 }
2809 elts[number_of_places_left_in_vector] = op;
2810 if (!CONSTANT_CLASS_P (op))
2811 constant_p = false;
2812 if (TREE_CODE (orig_op) == SSA_NAME
2813 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2814 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2815 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2816 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2817 place_after_defs = true;
2818
2819 if (number_of_places_left_in_vector == 0)
2820 {
2821 number_of_places_left_in_vector = nunits;
2822
2823 if (constant_p)
2824 vec_cst = build_vector (vector_type, elts);
2825 else
2826 {
2827 vec<constructor_elt, va_gc> *v;
2828 unsigned k;
2829 vec_alloc (v, nunits);
2830 for (k = 0; k < nunits; ++k)
2831 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2832 vec_cst = build_constructor (vector_type, v);
2833 }
2834 tree init;
2835 gimple_stmt_iterator gsi;
2836 if (place_after_defs)
2837 {
2838 gsi = gsi_for_stmt
2839 (vect_find_last_scalar_stmt_in_slp (slp_node));
2840 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2841 }
2842 else
2843 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2844 if (ctor_seq != NULL)
2845 {
2846 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2847 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2848 GSI_SAME_STMT);
2849 ctor_seq = NULL;
2850 }
2851 voprnds.quick_push (init);
2852 place_after_defs = false;
2853 }
2854 }
2855 }
2856
  /* Since the vectors are created in reverse order, reverse them back
     here.  */
2859 vec_num = voprnds.length ();
2860 for (j = vec_num; j != 0; j--)
2861 {
2862 vop = voprnds[j - 1];
2863 vec_oprnds->quick_push (vop);
2864 }
2865
2866 voprnds.release ();
2867
  /* If VF is greater than the unrolling factor needed for the SLP group of
     stmts, NUMBER_OF_VECTORS is greater than the number of vectors created
     above, and hence we have to replicate the vectors: for a reduction we
     push copies of the neutral vector, otherwise we repeat the vectors
     already built.  */
  tree neutral_vec = NULL;
  while (number_of_vectors > vec_oprnds->length ())
    {
      if (neutral_op)
        {
          /* Build the neutral vector only once and reuse it.  */
          if (!neutral_vec)
            neutral_vec = build_vector_from_val (vector_type, neutral_op);

          vec_oprnds->quick_push (neutral_vec);
        }
2883 else
2884 {
2885 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2886 vec_oprnds->quick_push (vop);
2887 }
2888 }
2889 }
2890
2891
2892 /* Get vectorized definitions from SLP_NODE that contains corresponding
2893 vectorized def-stmts. */
2894
2895 static void
2896 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2897 {
2898 tree vec_oprnd;
2899 gimple vec_def_stmt;
2900 unsigned int i;
2901
2902 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2903
2904 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2905 {
2906 gcc_assert (vec_def_stmt);
2907 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2908 vec_oprnds->quick_push (vec_oprnd);
2909 }
2910 }
2911
2912
2913 /* Get vectorized definitions for SLP_NODE.
2914 If the scalar definitions are loop invariants or constants, collect them and
2915 call vect_get_constant_vectors() to create vector stmts.
2916 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2917 must be stored in the corresponding child of SLP_NODE, and we call
2918 vect_get_slp_vect_defs () to retrieve them. */
2919
2920 void
2921 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2922 vec<vec<tree> > *vec_oprnds, int reduc_index)
2923 {
2924 gimple first_stmt;
2925 int number_of_vects = 0, i;
2926 unsigned int child_index = 0;
2927 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2928 slp_tree child = NULL;
2929 vec<tree> vec_defs;
2930 tree oprnd;
2931 bool vectorized_defs;
2932
2933 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2934 FOR_EACH_VEC_ELT (ops, i, oprnd)
2935 {
      /* For each operand we check if it has vectorized definitions in a
         child node or whether we need to create them (for invariants and
         constants).  We check if the LHS of the first stmt of the next
         child matches OPRND.  If it does, we found the correct child.
         Otherwise, we call vect_get_constant_vectors () and do not advance
         CHILD_INDEX, so that this child node is checked again for the next
         operand.  */
2942 vectorized_defs = false;
2943 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2944 {
2945 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2946
2947 /* We have to check both pattern and original def, if available. */
2948 if (child)
2949 {
2950 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2951 gimple related
2952 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2953
2954 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2955 || (related
2956 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2957 {
2958 /* The number of vector defs is determined by the number of
2959 vector statements in the node from which we get those
2960 statements. */
2961 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2962 vectorized_defs = true;
2963 child_index++;
2964 }
2965 }
2966 else
2967 child_index++;
2968 }
2969
2970 if (!vectorized_defs)
2971 {
2972 if (i == 0)
2973 {
2974 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2975 /* Number of vector stmts was calculated according to LHS in
2976 vect_schedule_slp_instance (), fix it by replacing LHS with
2977 RHS, if necessary. See vect_get_smallest_scalar_type () for
2978 details. */
2979 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2980 &rhs_size_unit);
2981 if (rhs_size_unit != lhs_size_unit)
2982 {
2983 number_of_vects *= rhs_size_unit;
2984 number_of_vects /= lhs_size_unit;
2985 }
2986 }
2987 }
2988
2989 /* Allocate memory for vectorized defs. */
2990 vec_defs = vNULL;
2991 vec_defs.create (number_of_vects);
2992
2993 /* For reduction defs we call vect_get_constant_vectors (), since we are
2994 looking for initial loop invariant values. */
2995 if (vectorized_defs && reduc_index == -1)
2996 /* The defs are already vectorized. */
2997 vect_get_slp_vect_defs (child, &vec_defs);
2998 else
2999 /* Build vectors from scalar defs. */
3000 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3001 number_of_vects, reduc_index);
3002
3003 vec_oprnds->quick_push (vec_defs);
3004
3005 /* For reductions, we only need initial values. */
3006 if (reduc_index != -1)
3007 return;
3008 }
3009 }
3010
3011
/* Create NCOPIES permutation statements using the vector mask MASK and two
   input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
   for the first copy, shifting by STRIDE elements of DR_CHAIN for every
   further copy.  (STRIDE is the number of vectorized stmts for NODE divided
   by the number of copies.)
   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
   where the created stmts must be inserted.  */
3020
3021 static inline void
3022 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
3023 tree mask, int first_vec_indx, int second_vec_indx,
3024 gimple_stmt_iterator *gsi, slp_tree node,
3025 tree vectype, vec<tree> dr_chain,
3026 int ncopies, int vect_stmts_counter)
3027 {
3028 tree perm_dest;
3029 gimple perm_stmt = NULL;
3030 stmt_vec_info next_stmt_info;
3031 int i, stride;
3032 tree first_vec, second_vec, data_ref;
3033
3034 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3035
3036 /* Initialize the vect stmts of NODE to properly insert the generated
3037 stmts later. */
3038 for (i = SLP_TREE_VEC_STMTS (node).length ();
3039 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3040 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3041
3042 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3043 for (i = 0; i < ncopies; i++)
3044 {
3045 first_vec = dr_chain[first_vec_indx];
3046 second_vec = dr_chain[second_vec_indx];
3047
3048 /* Generate the permute statement. */
3049 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3050 first_vec, second_vec, mask);
3051 data_ref = make_ssa_name (perm_dest, perm_stmt);
3052 gimple_set_lhs (perm_stmt, data_ref);
3053 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3054
3055 /* Store the vector statement in NODE. */
3056 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3057
3058 first_vec_indx += stride;
3059 second_vec_indx += stride;
3060 }
3061
3062 /* Mark the scalar stmt as vectorized. */
3063 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
3064 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
3065 }
3066
3067
/* Given FIRST_MASK_ELEMENT - the mask element in element representation -
   return in CURRENT_MASK_ELEMENT its equivalent in the target-specific
   representation.  Check that the mask is valid and return FALSE if not.
   Set NEED_NEXT_VECTOR to TRUE if the permutation requires moving to the
   next vector, i.e., the current first vector is not needed anymore.  */
3073
3074 static bool
3075 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
3076 int mask_nunits, bool only_one_vec, int index,
3077 unsigned char *mask, int *current_mask_element,
3078 bool *need_next_vector, int *number_of_mask_fixes,
3079 bool *mask_fixed, bool *needs_first_vector)
3080 {
3081 int i;
3082
3083 /* Convert to target specific representation. */
3084 *current_mask_element = first_mask_element + m;
3085 /* Adjust the value in case it's a mask for second and third vectors. */
3086 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3087
3088 if (*current_mask_element < 0)
3089 {
3090 if (dump_enabled_p ())
3091 {
3092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3093 "permutation requires past vector ");
3094 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3095 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3096 }
3097 return false;
3098 }
3099
3100 if (*current_mask_element < mask_nunits)
3101 *needs_first_vector = true;
3102
3103 /* We have only one input vector to permute but the mask accesses values in
3104 the next vector as well. */
3105 if (only_one_vec && *current_mask_element >= mask_nunits)
3106 {
3107 if (dump_enabled_p ())
3108 {
3109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3110 "permutation requires at least two vectors ");
3111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3112 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3113 }
3114
3115 return false;
3116 }
3117
3118 /* The mask requires the next vector. */
3119 while (*current_mask_element >= mask_nunits * 2)
3120 {
3121 if (*needs_first_vector || *mask_fixed)
3122 {
3123 /* We either need the first vector too or have already moved to the
3124 next vector. In both cases, this permutation needs three
3125 vectors. */
3126 if (dump_enabled_p ())
3127 {
3128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3129 "permutation requires at "
3130 "least three vectors ");
3131 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3132 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3133 }
3134
3135 return false;
3136 }
3137
3138 /* We move to the next vector, dropping the first one and working with
3139 the second and the third - we need to adjust the values of the mask
3140 accordingly. */
3141 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3142
3143 for (i = 0; i < index; i++)
3144 mask[i] -= mask_nunits * *number_of_mask_fixes;
3145
3146 (*number_of_mask_fixes)++;
3147 *mask_fixed = true;
3148 }
3149
3150 *need_next_vector = *mask_fixed;
3151
3152 /* This was the last element of this mask. Start a new one. */
3153 if (index == mask_nunits - 1)
3154 {
3155 *number_of_mask_fixes = 1;
3156 *mask_fixed = false;
3157 *needs_first_vector = false;
3158 }
3159
3160 return true;
3161 }
3162
3163
3164 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3165 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3166 permute statements for the SLP node NODE of the SLP instance
3167 SLP_NODE_INSTANCE. */
3168
3169 bool
3170 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3171 gimple_stmt_iterator *gsi, int vf,
3172 slp_instance slp_node_instance, bool analyze_only)
3173 {
3174 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3175 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3176 tree mask_element_type = NULL_TREE, mask_type;
3177 int i, j, k, nunits, vec_index = 0, scalar_index;
3178 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3179 gimple next_scalar_stmt;
3180 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3181 int first_mask_element;
3182 int index, unroll_factor, current_mask_element, ncopies;
3183 unsigned char *mask;
3184 bool only_one_vec = false, need_next_vector = false;
3185 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3186 int number_of_mask_fixes = 1;
3187 bool mask_fixed = false;
3188 bool needs_first_vector = false;
3189 machine_mode mode;
3190
3191 mode = TYPE_MODE (vectype);
3192
3193 if (!can_vec_perm_p (mode, false, NULL))
3194 {
3195 if (dump_enabled_p ())
3196 {
3197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3198 "no vect permute for ");
3199 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3200 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3201 }
3202 return false;
3203 }
3204
3205 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3206 same size as the vector element being permuted. */
3207 mask_element_type = lang_hooks.types.type_for_mode
3208 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3209 mask_type = get_vectype_for_scalar_type (mask_element_type);
3210 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3211 mask = XALLOCAVEC (unsigned char, nunits);
3212 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3213
3214 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
3215 unrolling factor. */
3216 orig_vec_stmts_num = group_size *
3217 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3218 if (orig_vec_stmts_num == 1)
3219 only_one_vec = true;
3220
3221 /* Number of copies is determined by the final vectorization factor
3222 relatively to SLP_NODE_INSTANCE unrolling factor. */
3223 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3224
3225 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3226 return false;
3227
3228 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3229
3230 /* Generate permutation masks for every NODE. Number of masks for each NODE
3231 is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
3235 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3236 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3237 ...
3238
3239 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3240 The last mask is illegal since we assume two operands for permute
3241 operation, and the mask element values can't be outside that range.
3242 Hence, the last mask must be converted into {2,5,5,5}.
3243 For the first two permutations we need the first and the second input
3244 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3245 we need the second and the third vectors: {b1,c1,a2,b2} and
3246 {c2,a3,b3,c3}. */
3247
3248 {
3249 scalar_index = 0;
3250 index = 0;
3251 vect_stmts_counter = 0;
3252 vec_index = 0;
3253 first_vec_index = vec_index++;
3254 if (only_one_vec)
3255 second_vec_index = first_vec_index;
3256 else
3257 second_vec_index = vec_index++;
3258
3259 for (j = 0; j < unroll_factor; j++)
3260 {
3261 for (k = 0; k < group_size; k++)
3262 {
3263 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3264 first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
3265 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3266 nunits, only_one_vec, index,
3267 mask, &current_mask_element,
3268 &need_next_vector,
3269 &number_of_mask_fixes, &mask_fixed,
3270 &needs_first_vector))
3271 return false;
3272 gcc_assert (current_mask_element >= 0
3273 && current_mask_element < 2 * nunits);
3274 mask[index++] = current_mask_element;
3275
3276 if (index == nunits)
3277 {
3278 index = 0;
3279 if (!can_vec_perm_p (mode, false, mask))
3280 {
3281 if (dump_enabled_p ())
3282 {
3283 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3284 vect_location,
3285 "unsupported vect permute { ");
3286 for (i = 0; i < nunits; ++i)
3287 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3288 mask[i]);
3289 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3290 }
3291 return false;
3292 }
3293
3294 if (!analyze_only)
3295 {
3296 int l;
3297 tree mask_vec, *mask_elts;
3298 mask_elts = XALLOCAVEC (tree, nunits);
3299 for (l = 0; l < nunits; ++l)
3300 mask_elts[l] = build_int_cst (mask_element_type,
3301 mask[l]);
3302 mask_vec = build_vector (mask_type, mask_elts);
3303
3304 if (need_next_vector)
3305 {
3306 first_vec_index = second_vec_index;
3307 second_vec_index = vec_index;
3308 }
3309
3310 next_scalar_stmt
3311 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3312
3313 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3314 mask_vec, first_vec_index, second_vec_index,
3315 gsi, node, vectype, dr_chain,
3316 ncopies, vect_stmts_counter++);
3317 }
3318 }
3319 }
3320 }
3321 }
3322
3323 return true;
3324 }
3325
3326
3327
3328 /* Vectorize SLP instance tree in postorder. */
3329
3330 static bool
3331 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3332 unsigned int vectorization_factor)
3333 {
3334 gimple stmt;
3335 bool grouped_store, is_store;
3336 gimple_stmt_iterator si;
3337 stmt_vec_info stmt_info;
3338 unsigned int vec_stmts_size, nunits, group_size;
3339 tree vectype;
3340 int i;
3341 slp_tree child;
3342
3343 if (!node)
3344 return false;
3345
3346 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3347 vect_schedule_slp_instance (child, instance, vectorization_factor);
3348
3349 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3350 stmt_info = vinfo_for_stmt (stmt);
3351
3352 /* VECTYPE is the type of the destination. */
3353 vectype = STMT_VINFO_VECTYPE (stmt_info);
3354 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3355 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3356
  /* For each SLP instance calculate the number of vector stmts to be
     created for the scalar stmts in each node of the SLP tree.  The number
     of vector stmts is the number of scalar elements in one scalar
     iteration (GROUP_SIZE) multiplied by VF and divided by the vector size,
     unless this is an SLP reduction, in which case the number of vector
     stmts equals the number of vector stmts of the children.  */
3364 if (GROUP_FIRST_ELEMENT (stmt_info)
3365 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3366 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3367 else
3368 vec_stmts_size = (vectorization_factor * group_size) / nunits;
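  /* E.g., with GROUP_SIZE 2, VF 4 and NUNITS 4 each node gets
     (4 * 2) / 4 == 2 vector stmts.  */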
3369
3370 if (!SLP_TREE_VEC_STMTS (node).exists ())
3371 {
3372 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3373 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3374 }
3375
3376 if (dump_enabled_p ())
3377 {
3378 dump_printf_loc (MSG_NOTE,vect_location,
3379 "------>vectorizing SLP node starting from: ");
3380 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3381 dump_printf (MSG_NOTE, "\n");
3382 }
3383
3384 /* Vectorized stmts go before the last scalar stmt which is where
3385 all uses are ready. */
3386 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3387
3388 /* Mark the first element of the reduction chain as reduction to properly
3389 transform the node. In the analysis phase only the last element of the
3390 chain is marked as reduction. */
3391 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3392 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3393 {
3394 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3395 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3396 }
3397
3398 /* Handle two-operation SLP nodes by vectorizing the group with
3399 both operations and then performing a merge. */
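  /* For instance, the group { a0 + b0, a1 - b1 } is vectorized once as an
     addition and once as a subtraction, and the two results are then
     blended with a VEC_PERM_EXPR selecting lane 0 from the add result and
     lane 1 from the sub result.  */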
3400 if (SLP_TREE_TWO_OPERATORS (node))
3401 {
3402 enum tree_code code0 = gimple_assign_rhs_code (stmt);
      /* ERROR_MARK here only silences warnings; OCODE is always set before
         it is used, which happens only when !ALLSAME.  */
      enum tree_code ocode = ERROR_MARK;
3404 gimple ostmt;
3405 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3406 bool allsame = true;
3407 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3408 if (gimple_assign_rhs_code (ostmt) != code0)
3409 {
3410 mask[i] = 1;
3411 allsame = false;
3412 ocode = gimple_assign_rhs_code (ostmt);
3413 }
3414 else
3415 mask[i] = 0;
3416 if (!allsame)
3417 {
3418 vec<gimple> v0;
3419 vec<gimple> v1;
3420 unsigned j;
3421 tree tmask = NULL_TREE;
3422 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3423 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3424 SLP_TREE_VEC_STMTS (node).truncate (0);
3425 gimple_assign_set_rhs_code (stmt, ocode);
3426 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3427 gimple_assign_set_rhs_code (stmt, code0);
3428 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3429 SLP_TREE_VEC_STMTS (node).truncate (0);
3430 tree meltype = build_nonstandard_integer_type
3431 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3432 tree mvectype = get_same_sized_vectype (meltype, vectype);
3433 unsigned k = 0, l;
3434 for (j = 0; j < v0.length (); ++j)
3435 {
3436 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3437 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3438 {
3439 if (k >= group_size)
3440 k = 0;
3441 melts[l] = build_int_cst
3442 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3443 }
3444 tmask = build_vector (mvectype, melts);
3445
3446 /* ??? Not all targets support a VEC_PERM_EXPR with a
3447 constant mask that would translate to a vec_merge RTX
3448 (with their vec_perm_const_ok). We can either not
3449 vectorize in that case or let veclower do its job.
3450 Unfortunately that isn't too great and at least for
3451 plus/minus we'd eventually like to match targets
3452 vector addsub instructions. */
3453 gimple vstmt;
3454 vstmt = gimple_build_assign (make_ssa_name (vectype),
3455 VEC_PERM_EXPR,
3456 gimple_assign_lhs (v0[j]),
3457 gimple_assign_lhs (v1[j]), tmask);
3458 vect_finish_stmt_generation (stmt, vstmt, &si);
3459 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3460 }
3461 v0.release ();
3462 v1.release ();
3463 return false;
3464 }
3465 }
3466 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3467 return is_store;
3468 }
3469
3470 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3471 For loop vectorization this is done in vectorizable_call, but for SLP
3472 it needs to be deferred until end of vect_schedule_slp, because multiple
3473 SLP instances may refer to the same scalar stmt. */
3474
3475 static void
3476 vect_remove_slp_scalar_calls (slp_tree node)
3477 {
3478 gimple stmt, new_stmt;
3479 gimple_stmt_iterator gsi;
3480 int i;
3481 slp_tree child;
3482 tree lhs;
3483 stmt_vec_info stmt_info;
3484
3485 if (!node)
3486 return;
3487
3488 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3489 vect_remove_slp_scalar_calls (child);
3490
3491 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3492 {
3493 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3494 continue;
3495 stmt_info = vinfo_for_stmt (stmt);
3496 if (stmt_info == NULL
3497 || is_pattern_stmt_p (stmt_info)
3498 || !PURE_SLP_STMT (stmt_info))
3499 continue;
3500 lhs = gimple_call_lhs (stmt);
3501 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3502 set_vinfo_for_stmt (new_stmt, stmt_info);
3503 set_vinfo_for_stmt (stmt, NULL);
3504 STMT_VINFO_STMT (stmt_info) = new_stmt;
3505 gsi = gsi_for_stmt (stmt);
3506 gsi_replace (&gsi, new_stmt, false);
3507 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3508 }
3509 }
3510
3511 /* Generate vector code for all SLP instances in the loop/basic block. */
3512
3513 bool
3514 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3515 {
3516 vec<slp_instance> slp_instances;
3517 slp_instance instance;
3518 unsigned int i, vf;
3519 bool is_store = false;
3520
3521 if (loop_vinfo)
3522 {
3523 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3524 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3525 }
3526 else
3527 {
3528 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3529 vf = 1;
3530 }
3531
3532 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3533 {
3534 /* Schedule the tree of INSTANCE. */
3535 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3536 instance, vf);
3537 if (dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "vectorizing stmts using SLP.\n");
3540 }
3541
3542 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3543 {
3544 slp_tree root = SLP_INSTANCE_TREE (instance);
3545 gimple store;
3546 unsigned int j;
3547 gimple_stmt_iterator gsi;
3548
3549 /* Remove scalar call stmts. Do not do this for basic-block
3550 vectorization as not all uses may be vectorized.
3551 ??? Why should this be necessary? DCE should be able to
3552 remove the stmts itself.
3553 ??? For BB vectorization we can as well remove scalar
3554 stmts starting from the SLP tree root if they have no
3555 uses. */
3556 if (loop_vinfo)
3557 vect_remove_slp_scalar_calls (root);
3558
3559 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3560 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3561 {
3562 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3563 break;
3564
3565 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3566 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3567 /* Free the attached stmt_vec_info and remove the stmt. */
3568 gsi = gsi_for_stmt (store);
3569 unlink_stmt_vdef (store);
3570 gsi_remove (&gsi, true);
3571 release_defs (store);
3572 free_stmt_vec_info (store);
3573 }
3574 }
3575
3576 return is_store;
3577 }
3578
3579
3580 /* Vectorize the basic block. */
3581
3582 void
3583 vect_slp_transform_bb (basic_block bb)
3584 {
3585 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3586 gimple_stmt_iterator si;
3587
3588 gcc_assert (bb_vinfo);
3589
3590 if (dump_enabled_p ())
3591 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3592
3593 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3594 {
3595 gimple stmt = gsi_stmt (si);
3596 stmt_vec_info stmt_info;
3597
3598 if (dump_enabled_p ())
3599 {
3600 dump_printf_loc (MSG_NOTE, vect_location,
3601 "------>SLPing statement: ");
3602 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3603 dump_printf (MSG_NOTE, "\n");
3604 }
3605
3606 stmt_info = vinfo_for_stmt (stmt);
3607 gcc_assert (stmt_info);
3608
3609 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3610 if (STMT_SLP_TYPE (stmt_info))
3611 {
3612 vect_schedule_slp (NULL, bb_vinfo);
3613 break;
3614 }
3615 }
3616
3617 if (dump_enabled_p ())
3618 dump_printf_loc (MSG_NOTE, vect_location,
3619 "BASIC BLOCK VECTORIZED\n");
3620
3621 destroy_bb_vec_info (bb_vinfo);
3622 }