/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "rtl.h"
#include "flags.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the location of the first statement that has one, and
   UNKNOWN_LOCATION if none is found.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;

  return node;
}

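/* Illustrative example (editor's addition, not part of the original
   sources): for the store group

       a[0] = x0;
       a[1] = x1;

   the node created above gets nops == 1, i.e. room for a single child
   that will later hold the definition stmts of x0 and x1.  For a
   COND_EXPR right-hand side one extra operand slot is reserved because
   the comparison embedded in the condition contributes two operands.  */
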
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}

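/* Illustrative example (editor's addition, not part of the original
   sources): for an interleaving chain loading a[0], a[2] and a[3],
   GROUP_GAP of the second element is 2 and of the third is 1, so the
   places returned for the three loads are 0, 2 and 3 -- the element
   index within the chain, gaps included.  */
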
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 if there was a fatal
   error, 1 if the error could be corrected by swapping operands of the
   operation, and 0 if everything is OK.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}

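/* Illustrative example (editor's addition, not part of the original
   sources): for the group

       x0 = a[0] + 5;
       x1 = 5 + a[1];

   the first operand of the second stmt is a constant while the first
   stmt's first operand was an internal def.  Since PLUS_EXPR is
   commutative the function above retries with the operands of the
   second stmt swapped and, on success, commits the swap to the
   GIMPLE IL.  */
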
/* Verify that the scalar stmts STMTS are isomorphic, do not require a
   data permutation, and are not of unsupported types of operation.
   Return true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  gimple first_stmt = stmts[0], stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* If populating the vector type requires unrolling then fail
         before adjusting *max_nunits for basic-block vectorization.  */
      if (bb_vinfo
          && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unrolling required "
                           "in basic block SLP\n");
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.  */
              if (unrolling_factor > 1
                  && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                       && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                      /* If the group is split up then GROUP_GAP
                         isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
                      || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of the interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
        = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
        {
          sel[i] = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
        }
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

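/* Illustrative example (editor's addition, not part of the original
   sources): the group

       a[0] = b[0] + c[0];
       a[1] = b[1] - c[1];

   is accepted with *TWO_OPERATORS set: both a vector addition and a
   vector subtraction will be generated and their results blended with
   a permutation that selects even lanes from the first result and odd
   lanes from the second, provided can_vec_perm_p accepts that mask.  */
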
/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation, or are of unsupported types of operation.  Otherwise,
   return TRUE.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  bool two_operators = false;
  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches,
                              &two_operators))
    return false;
  SLP_TREE_TWO_OPERATORS (*node) = two_operators;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, i, &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ())
            {
              unsigned int j;
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (grandchild != NULL)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  *max_nunits = old_max_nunits;
                  loads->truncate (old_nloads);
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  vect_free_slp_tree (child);
                  SLP_TREE_CHILDREN (*node).quick_push (NULL);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (bb_vinfo
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          SLP_TREE_CHILDREN (*node).quick_push (NULL);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && !SLP_TREE_TWO_OPERATORS (*node)
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   tem, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  */
              for (j = 0; j < group_size; ++j)
                if (!matches[j])
                  {
                    gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                  }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

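/* Illustrative example (editor's addition, not part of the original
   sources): for

       a[0] = b[0] + x * c[0];
       a[1] = b[1] + x * c[1];

   the root node holds the two additions; its first child holds the
   loads from b[], its second child the multiplications, which in turn
   have one child for the loads from c[].  The loop-invariant operand x
   gets no child node at all: it is a vect_external_def whose vector is
   built when generating the vectorized stmts.  */
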
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}

/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}

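/* Illustrative example (editor's addition, not part of the original
   sources): with GROUP_SIZE 4 and PERMUTATION {2, 0, 3, 1}, the
   assignment tmp_stmts[permutation[i]] = stmt moves the stmt at index
   0 to index 2, the stmt at index 1 to index 0, and so on, in every
   node of the tree.  */
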
/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ??? This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}

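/* Illustrative example (editor's addition, not part of the original
   sources): if an interleaving chain loads a[0..7] and a basic-block
   SLP node uses a[4], a[5], a[6], a[7] in that order, the node is a
   subchain and its load permutation is released; a node using a[1],
   a[0], a[3], a[2] instead keeps the permutation {1, 0, 3, 2}, which
   the target must be able to generate.  */
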
/* Find the last scalar stmt in the SLP group of stmts in NODE; for
   pattern stmts the position of the original stmt is used.  */

static gimple
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple last = NULL, stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost)
{
  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ??? Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
        {
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                            stmt_info, 0, vect_body);
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
                            stmt_info, 0, vect_body);
        }
    }

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, STMT_VINFO_LOOP_VINFO (stmt_info),
                              STMT_VINFO_BB_VINFO (stmt_info),
                              &def_stmt, &def, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ??? We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Adjust the group_size by the vectorization factor which is always one
     for basic-block vectorization.  */
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
        {
        case DOT_PROD_EXPR:
        case SAD_EXPR:
          nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
                     (TREE_TYPE (gimple_assign_rhs1 (stmt))));
          break;
        default:;
        }
    }
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, &body_cost_vec,
                           ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

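/* Illustrative example (editor's addition, not part of the original
   sources): for a group of 6 int stores, a vectorization factor of 2
   and V4SI vectors, group_size becomes 6 * 2 = 12, nunits is 4, and
   ncopies_for_cost = least_common_multiple (4, 12) / 4 = 3 vector
   stmts are costed per SLP node.  */
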
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
      /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, matches, &npermutes, NULL,
                           max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_LOADS (new_instance) = loads;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted)
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }
        }

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}

1895
1896 /* Check if there are stmts in the loop that can be vectorized using SLP.
1897    Build SLP trees of packed scalar stmts if SLP is possible.  */
1898
1899 bool
1900 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1901 unsigned max_tree_size)
1902 {
1903 unsigned int i;
1904 vec<gimple> grouped_stores;
1905 vec<gimple> reductions = vNULL;
1906 vec<gimple> reduc_chains = vNULL;
1907 gimple first_element;
1908 bool ok = false;
1909
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1912
1913 if (loop_vinfo)
1914 {
1915 grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1916 reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1917 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1918 }
1919 else
1920 grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1921
1922 /* Find SLP sequences starting from groups of grouped stores. */
1923 FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1924 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1925 max_tree_size))
1926 ok = true;
1927
1928 if (reduc_chains.length () > 0)
1929 {
1930 /* Find SLP sequences starting from reduction chains. */
1931 FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1932 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1933 max_tree_size))
1934 ok = true;
1935 else
1936 return false;
1937
1938       /* Don't try to vectorize SLP reductions if a reduction chain was
1939 	 detected.  */
1940 return ok;
1941 }
1942
1943 /* Find SLP sequences starting from groups of reductions. */
1944 if (reductions.length () > 1
1945 && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1946 max_tree_size))
1947 ok = true;
1948
1949 return true;
1950 }
1951
1952
1953 /* For each possible SLP instance decide whether to SLP it and calculate
1954    the overall unrolling factor needed to SLP the loop.  Return TRUE if
1955    decided to SLP at least one instance.  */
1956
1957 bool
1958 vect_make_slp_decision (loop_vec_info loop_vinfo)
1959 {
1960 unsigned int i, unrolling_factor = 1;
1961 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1962 slp_instance instance;
1963 int decided_to_slp = 0;
1964
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1967 "\n");
1968
1969 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1970 {
1971 /* FORNOW: SLP if you can. */
1972 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1973 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1974
1975 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1976 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1977 loop-based vectorization. Such stmts will be marked as HYBRID. */
1978 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1979 decided_to_slp++;
1980 }
1981
1982 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1983
1984 if (decided_to_slp && dump_enabled_p ())
1985 dump_printf_loc (MSG_NOTE, vect_location,
1986 "Decided to SLP %d instances. Unrolling factor %d\n",
1987 decided_to_slp, unrolling_factor);
1988
1989 return (decided_to_slp > 0);
1990 }
1991
1992
1993 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1994 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1995
1996 static void
1997 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1998 {
1999 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2000 imm_use_iterator imm_iter;
2001 gimple use_stmt;
2002 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2003 slp_tree child;
2004 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2005 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2006 int j;
2007
2008 /* Propagate hybrid down the SLP tree. */
2009 if (stype == hybrid)
2010 ;
2011 else if (HYBRID_SLP_STMT (stmt_vinfo))
2012 stype = hybrid;
2013 else
2014 {
2015 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2016 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2017 /* We always get the pattern stmt here, but for immediate
2018 uses we have to use the LHS of the original stmt. */
2019 gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2020 if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
2021 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2022 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
2023 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
2024 {
2025 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2026 continue;
2027 use_vinfo = vinfo_for_stmt (use_stmt);
2028 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2029 && STMT_VINFO_RELATED_STMT (use_vinfo))
2030 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2031 if (!STMT_SLP_TYPE (use_vinfo)
2032 && (STMT_VINFO_RELEVANT (use_vinfo)
2033 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2034 && !(gimple_code (use_stmt) == GIMPLE_PHI
2035 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2036 stype = hybrid;
2037 }
2038 }
2039
2040 if (stype == hybrid)
2041 {
2042 if (dump_enabled_p ())
2043 {
2044 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2045 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2046 }
2047 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2048 }
2049
2050 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2051 if (child)
2052 vect_detect_hybrid_slp_stmts (child, i, stype);
2053 }
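/* Illustrative example (hypothetical loop, not taken from this file):
   given

     tmp0 = b[2*i] + x;
     tmp1 = b[2*i+1] + x;
     a[2*i] = tmp0;
     a[2*i+1] = tmp1;
     s += tmp0;

   the defs TMP0/TMP1 feed a pure SLP store group, but TMP0 is also
   consumed by the non-SLP reduction of S, so the walk above finds that
   use and marks TMP0's definition hybrid: it is vectorized as part of
   the SLP tree and must remain available to loop-based vectorization.  */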
2054
2055 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2056
2057 static tree
2058 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2059 {
2060 walk_stmt_info *wi = (walk_stmt_info *)data;
2061 struct loop *loopp = (struct loop *)wi->info;
2062
2063 if (wi->is_lhs)
2064 return NULL_TREE;
2065
2066 if (TREE_CODE (*tp) == SSA_NAME
2067 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2068 {
2069 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
2070 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2071 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2072 {
2073 if (dump_enabled_p ())
2074 {
2075 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2076 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2077 }
2078 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2079 }
2080 }
2081
2082 return NULL_TREE;
2083 }
2084
2085 static tree
2086 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2087 walk_stmt_info *)
2088 {
2089 /* If the stmt is in a SLP instance then this isn't a reason
2090 to mark use definitions in other SLP instances as hybrid. */
2091 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2092 *handled = true;
2093 return NULL_TREE;
2094 }
2095
2096 /* Find stmts that must be both vectorized and SLPed. */
2097
2098 void
2099 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2100 {
2101 unsigned int i;
2102 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2103 slp_instance instance;
2104
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2107 "\n");
2108
2109   /* First walk all pattern stmts in the loop and mark defs of their uses
2110      as hybrid, because immediate uses in pattern stmts are not recorded.  */
2111 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2112 {
2113 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2114 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2115 gsi_next (&gsi))
2116 {
2117 gimple stmt = gsi_stmt (gsi);
2118 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2119 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2120 {
2121 walk_stmt_info wi;
2122 memset (&wi, 0, sizeof (wi));
2123 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2124 gimple_stmt_iterator gsi2
2125 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2126 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2127 vect_detect_hybrid_slp_1, &wi);
2128 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2129 vect_detect_hybrid_slp_2,
2130 vect_detect_hybrid_slp_1, &wi);
2131 }
2132 }
2133 }
2134
2135 /* Then walk the SLP instance trees marking stmts with uses in
2136 non-SLP stmts as hybrid, also propagating hybrid down the
2137 SLP tree, collecting the above info on-the-fly. */
2138 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2139 {
2140       for (unsigned j = 0; j < SLP_INSTANCE_GROUP_SIZE (instance); ++j)
2141 	vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2142 				      j, pure_slp);
2143 }
2144 }
2145
2146
2147 /* Create and initialize a new bb_vec_info struct for BB, as well as
2148 stmt_vec_info structs for all the stmts in it. */
2149
2150 static bb_vec_info
2151 new_bb_vec_info (basic_block bb)
2152 {
2153 bb_vec_info res = NULL;
2154 gimple_stmt_iterator gsi;
2155
2156 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2157 BB_VINFO_BB (res) = bb;
2158
2159 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2160 {
2161 gimple stmt = gsi_stmt (gsi);
2162 gimple_set_uid (stmt, 0);
2163 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2164 }
2165
2166 BB_VINFO_GROUPED_STORES (res).create (10);
2167 BB_VINFO_SLP_INSTANCES (res).create (2);
2168 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2169
2170 bb->aux = res;
2171 return res;
2172 }
2173
2174
2175 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2176 stmts in the basic block. */
2177
2178 static void
2179 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2180 {
2181 vec<slp_instance> slp_instances;
2182 slp_instance instance;
2183 basic_block bb;
2184 gimple_stmt_iterator si;
2185 unsigned i;
2186
2187 if (!bb_vinfo)
2188 return;
2189
2190 bb = BB_VINFO_BB (bb_vinfo);
2191
2192 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2193 {
2194 gimple stmt = gsi_stmt (si);
2195 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2196
2197 if (stmt_info)
2198 /* Free stmt_vec_info. */
2199 free_stmt_vec_info (stmt);
2200 }
2201
2202 vect_destroy_datarefs (NULL, bb_vinfo);
2203 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2204 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2205 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2206 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2207 vect_free_slp_instance (instance);
2208 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2209 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2210 free (bb_vinfo);
2211 bb->aux = NULL;
2212 }
2213
2214
2215 /* Analyze statements contained in SLP tree node after recursively analyzing
2216 the subtree. Return TRUE if the operations are supported. */
2217
2218 static bool
2219 vect_slp_analyze_node_operations (slp_tree node)
2220 {
2221 bool dummy;
2222 int i;
2223 gimple stmt;
2224 slp_tree child;
2225
2226 if (!node)
2227 return true;
2228
2229 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2230 if (!vect_slp_analyze_node_operations (child))
2231 return false;
2232
2233 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2234 {
2235 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2236 gcc_assert (stmt_info);
2237 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2238
2239 if (!vect_analyze_stmt (stmt, &dummy, node))
2240 return false;
2241 }
2242
2243 return true;
2244 }
2245
2246
2247 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2248 operations are supported. */
2249
2250 bool
2251 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2252 {
2253 slp_instance instance;
2254 int i;
2255
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE, vect_location,
2258 "=== vect_slp_analyze_operations ===\n");
2259
2260 for (i = 0; slp_instances.iterate (i, &instance); )
2261 {
2262 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2263 {
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "removing SLP instance operations starting from: ");
2266 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2267 SLP_TREE_SCALAR_STMTS
2268 (SLP_INSTANCE_TREE (instance))[0], 0);
2269 vect_free_slp_instance (instance);
2270 slp_instances.ordered_remove (i);
2271 }
2272 else
2273 {
2274 /* Compute the costs of the SLP instance. */
2275 vect_analyze_slp_cost (instance, data);
2276 i++;
2277 }
2278 }
2279
2280 if (!slp_instances.length ())
2281 return false;
2282
2283 return true;
2284 }
2285
2286
2287 /* Compute the scalar cost of the SLP node NODE and its children
2288    and return it.  Do not account for defs that are marked in LIFE;
2289    update LIFE according to the uses of NODE.  */
2290
2291 static unsigned
2292 vect_bb_slp_scalar_cost (basic_block bb,
2293 slp_tree node, vec<bool, va_heap> *life)
2294 {
2295 unsigned scalar_cost = 0;
2296 unsigned i;
2297 gimple stmt;
2298 slp_tree child;
2299
2300 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2301 {
2302 unsigned stmt_cost;
2303 ssa_op_iter op_iter;
2304 def_operand_p def_p;
2305 stmt_vec_info stmt_info;
2306
2307 if ((*life)[i])
2308 continue;
2309
2310       /* If there is a non-vectorized use of the defs then the scalar
2311 	 stmt is kept live, in which case we do not account for it, or for
2312 	 any required defs in the SLP children, in the scalar cost.  This
2313 	 way we make the vectorization more costly when compared to
2314 	 the scalar cost.  */
2315 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2316 {
2317 imm_use_iterator use_iter;
2318 gimple use_stmt;
2319 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2320 if (!is_gimple_debug (use_stmt)
2321 && (gimple_code (use_stmt) == GIMPLE_PHI
2322 || gimple_bb (use_stmt) != bb
2323 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2324 {
2325 (*life)[i] = true;
2326 BREAK_FROM_IMM_USE_STMT (use_iter);
2327 }
2328 }
2329 if ((*life)[i])
2330 continue;
2331
2332 stmt_info = vinfo_for_stmt (stmt);
2333 if (STMT_VINFO_DATA_REF (stmt_info))
2334 {
2335 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2336 stmt_cost = vect_get_stmt_cost (scalar_load);
2337 else
2338 stmt_cost = vect_get_stmt_cost (scalar_store);
2339 }
2340 else
2341 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2342
2343 scalar_cost += stmt_cost;
2344 }
2345
2346 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2347 if (child)
2348 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2349
2350 return scalar_cost;
2351 }
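/* Illustrative note on the liveness rule above (hypothetical stmt): if
   an SLP def such as tmp = a[i] + b[i] also has a use outside the basic
   block or in a PHI, the scalar stmt must stay live after vectorization,
   so its cost is deliberately left out of the returned scalar cost; a
   smaller scalar cost makes the vector-vs-scalar comparison below harder
   for the vector side to win.  */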
2352
2353 /* Check if vectorization of the basic block is profitable. */
2354
2355 static bool
2356 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2357 {
2358 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2359 slp_instance instance;
2360 int i;
2361 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2362 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2363
2364 /* Calculate scalar cost. */
2365 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2366 {
2367 auto_vec<bool, 20> life;
2368 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2369 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2370 SLP_INSTANCE_TREE (instance),
2371 &life);
2372 }
2373
2374 /* Complete the target-specific cost calculation. */
2375 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2376 &vec_inside_cost, &vec_epilogue_cost);
2377
2378 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2379
2380 if (dump_enabled_p ())
2381 {
2382 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2383 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2384 vec_inside_cost);
2385 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2386 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2387 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2388 }
2389
2390   /* Vectorization is profitable if its cost is less than the cost of the
2391      scalar version.  */
2392 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2393 return false;
2394
2395 return true;
2396 }
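/* Numerical sketch of the test above (made-up costs): with
   vec_prologue_cost = 1, vec_epilogue_cost = 0 and vec_inside_cost = 4,
   a block with scalar_cost = 6 is accepted (1 + 0 + 4 < 6) while one
   with scalar_cost = 5 is rejected, since the comparison uses >= and
   therefore ties go to the scalar code.  */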
2397
2398 /* Check if the basic block can be vectorized. */
2399
2400 static bb_vec_info
2401 vect_slp_analyze_bb_1 (basic_block bb)
2402 {
2403 bb_vec_info bb_vinfo;
2404 vec<slp_instance> slp_instances;
2405 slp_instance instance;
2406 int i;
2407 int min_vf = 2;
2408 unsigned n_stmts = 0;
2409
2410 bb_vinfo = new_bb_vec_info (bb);
2411 if (!bb_vinfo)
2412 return NULL;
2413
2414 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2415 {
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2418 "not vectorized: unhandled data-ref in basic "
2419 "block.\n");
2420
2421 destroy_bb_vec_info (bb_vinfo);
2422 return NULL;
2423 }
2424
2425 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2426 {
2427 if (dump_enabled_p ())
2428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2429 "not vectorized: not enough data-refs in "
2430 "basic block.\n");
2431
2432 destroy_bb_vec_info (bb_vinfo);
2433 return NULL;
2434 }
2435
2436 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2437 {
2438 if (dump_enabled_p ())
2439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2440 "not vectorized: unhandled data access in "
2441 "basic block.\n");
2442
2443 destroy_bb_vec_info (bb_vinfo);
2444 return NULL;
2445 }
2446
2447 vect_pattern_recog (NULL, bb_vinfo);
2448
2449 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2450 {
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "not vectorized: bad data alignment in basic "
2454 "block.\n");
2455
2456 destroy_bb_vec_info (bb_vinfo);
2457 return NULL;
2458 }
2459
2460 /* Check the SLP opportunities in the basic block, analyze and build SLP
2461 trees. */
2462 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2463 {
2464 if (dump_enabled_p ())
2465 {
2466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2467 "Failed to SLP the basic block.\n");
2468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2469 "not vectorized: failed to find SLP opportunities "
2470 "in basic block.\n");
2471 }
2472
2473 destroy_bb_vec_info (bb_vinfo);
2474 return NULL;
2475 }
2476
2477 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2478
2479 /* Mark all the statements that we want to vectorize as pure SLP and
2480 relevant. */
2481 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2482 {
2483 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2484 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2485 }
2486
2487 /* Mark all the statements that we do not want to vectorize. */
2488 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2489 !gsi_end_p (gsi); gsi_next (&gsi))
2490 {
2491 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2492 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2493 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2494 }
2495
2496 /* Analyze dependences. At this point all stmts not participating in
2497 vectorization have to be marked. Dependence analysis assumes
2498 that we either vectorize all SLP instances or none at all. */
2499 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2500 {
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2503 "not vectorized: unhandled data dependence "
2504 "in basic block.\n");
2505
2506 destroy_bb_vec_info (bb_vinfo);
2507 return NULL;
2508 }
2509
2510 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2511 {
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514 "not vectorized: unsupported alignment in basic "
2515 "block.\n");
2516 destroy_bb_vec_info (bb_vinfo);
2517 return NULL;
2518 }
2519
2520 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2521 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2522 {
2523 if (dump_enabled_p ())
2524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2525 "not vectorized: bad operation in basic block.\n");
2526
2527 destroy_bb_vec_info (bb_vinfo);
2528 return NULL;
2529 }
2530
2531 /* Cost model: check if the vectorization is worthwhile. */
2532 if (!unlimited_cost_model (NULL)
2533 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2534 {
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2537 "not vectorized: vectorization is not "
2538 "profitable.\n");
2539
2540 destroy_bb_vec_info (bb_vinfo);
2541 return NULL;
2542 }
2543
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location,
2546 "Basic block will be vectorized using SLP\n");
2547
2548 return bb_vinfo;
2549 }
2550
2551
2552 bb_vec_info
2553 vect_slp_analyze_bb (basic_block bb)
2554 {
2555 bb_vec_info bb_vinfo;
2556 int insns = 0;
2557 gimple_stmt_iterator gsi;
2558 unsigned int vector_sizes;
2559
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2562
2563 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2564 {
2565 gimple stmt = gsi_stmt (gsi);
2566 if (!is_gimple_debug (stmt)
2567 && !gimple_nop_p (stmt)
2568 && gimple_code (stmt) != GIMPLE_LABEL)
2569 insns++;
2570 }
2571
2572 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2573 {
2574 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2576 "not vectorized: too many instructions in "
2577 "basic block.\n");
2578
2579 return NULL;
2580 }
2581
2582   /* Autodetect the first vector size we try.  */
2583 current_vector_size = 0;
2584 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2585
2586 while (1)
2587 {
2588 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2589 if (bb_vinfo)
2590 return bb_vinfo;
2591
2592 destroy_bb_vec_info (bb_vinfo);
2593
2594 vector_sizes &= ~current_vector_size;
2595 if (vector_sizes == 0
2596 || current_vector_size == 0)
2597 return NULL;
2598
2599 /* Try the next biggest vector size. */
2600 current_vector_size = 1 << floor_log2 (vector_sizes);
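      /* Illustrative walk-through (the available sizes are an assumption
	 about the target): if autovectorize_vector_sizes () returned the
	 bitmask 16 | 8 == 24 and the autodetected size was 16, then after
	 vector_sizes &= ~current_vector_size the mask is 8, and
	 1 << floor_log2 (8) == 8 selects the next smaller size to try.  */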
2601 if (dump_enabled_p ())
2602 dump_printf_loc (MSG_NOTE, vect_location,
2603 "***** Re-trying analysis with "
2604 "vector size %d\n", current_vector_size);
2605 }
2606 }
2607
2608
2609 /* For constant and loop invariant defs of SLP_NODE this function returns
2610 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2611 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2612 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2613 REDUC_INDEX is the index of the reduction operand in the statements, unless
2614 it is -1. */
2615
2616 static void
2617 vect_get_constant_vectors (tree op, slp_tree slp_node,
2618 vec<tree> *vec_oprnds,
2619 unsigned int op_num, unsigned int number_of_vectors,
2620 int reduc_index)
2621 {
2622 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2623 gimple stmt = stmts[0];
2624 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2625 unsigned nunits;
2626 tree vec_cst;
2627 tree *elts;
2628 unsigned j, number_of_places_left_in_vector;
2629 tree vector_type;
2630 tree vop;
2631 int group_size = stmts.length ();
2632 unsigned int vec_num, i;
2633 unsigned number_of_copies = 1;
2634 vec<tree> voprnds;
2635 voprnds.create (number_of_vectors);
2636 bool constant_p, is_store;
2637 tree neutral_op = NULL;
2638 enum tree_code code = gimple_expr_code (stmt);
2639 gimple def_stmt;
2640 struct loop *loop;
2641 gimple_seq ctor_seq = NULL;
2642
2643 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2644 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2645
2646 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2647 && reduc_index != -1)
2648 {
2649 op_num = reduc_index;
2650 op = gimple_op (stmt, op_num + 1);
2651 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2652 we need either neutral operands or the original operands. See
2653 get_initial_def_for_reduction() for details. */
2654 switch (code)
2655 {
2656 case WIDEN_SUM_EXPR:
2657 case DOT_PROD_EXPR:
2658 case SAD_EXPR:
2659 case PLUS_EXPR:
2660 case MINUS_EXPR:
2661 case BIT_IOR_EXPR:
2662 case BIT_XOR_EXPR:
2663 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2664 neutral_op = build_real (TREE_TYPE (op), dconst0);
2665 else
2666 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2667
2668 break;
2669
2670 case MULT_EXPR:
2671 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2672 neutral_op = build_real (TREE_TYPE (op), dconst1);
2673 else
2674 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2675
2676 break;
2677
2678 case BIT_AND_EXPR:
2679 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2680 break;
2681
2682 /* For MIN/MAX we don't have an easy neutral operand but
2683 the initial values can be used fine here. Only for
2684 a reduction chain we have to force a neutral element. */
2685 case MAX_EXPR:
2686 case MIN_EXPR:
2687 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2688 neutral_op = NULL;
2689 else
2690 {
2691 def_stmt = SSA_NAME_DEF_STMT (op);
2692 loop = (gimple_bb (stmt))->loop_father;
2693 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2694 loop_preheader_edge (loop));
2695 }
2696 break;
2697
2698 default:
2699 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2700 neutral_op = NULL;
2701 }
2702 }
2703
2704 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2705 {
2706 is_store = true;
2707 op = gimple_assign_rhs1 (stmt);
2708 }
2709 else
2710 is_store = false;
2711
2712 gcc_assert (op);
2713
2714 if (CONSTANT_CLASS_P (op))
2715 constant_p = true;
2716 else
2717 constant_p = false;
2718
2719 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2720 created vectors. It is greater than 1 if unrolling is performed.
2721
2722 For example, we have two scalar operands, s1 and s2 (e.g., group of
2723 strided accesses of size two), while NUNITS is four (i.e., four scalars
2724 of this type can be packed in a vector). The output vector will contain
2725 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2726 will be 2).
2727
2728 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2729 containing the operands.
2730
2731 For example, NUNITS is four as before, and the group size is 8
2732 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2733 {s5, s6, s7, s8}. */
2734
2735 number_of_copies = nunits * number_of_vectors / group_size;
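  /* E.g. (following the comment above) GROUP_SIZE = 2, NUNITS = 4 and
     NUMBER_OF_VECTORS = 1 give 4 * 1 / 2 = 2 copies, while GROUP_SIZE = 8,
     NUNITS = 4 and NUMBER_OF_VECTORS = 2 give 4 * 2 / 8 = 1 copy.  */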
2736
2737 number_of_places_left_in_vector = nunits;
2738 elts = XALLOCAVEC (tree, nunits);
2739 bool place_after_defs = false;
2740 for (j = 0; j < number_of_copies; j++)
2741 {
2742 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2743 {
2744 if (is_store)
2745 op = gimple_assign_rhs1 (stmt);
2746 else
2747 {
2748 switch (code)
2749 {
2750 case COND_EXPR:
2751 if (op_num == 0 || op_num == 1)
2752 {
2753 tree cond = gimple_assign_rhs1 (stmt);
2754 op = TREE_OPERAND (cond, op_num);
2755 }
2756 else
2757 {
2758 if (op_num == 2)
2759 op = gimple_assign_rhs2 (stmt);
2760 else
2761 op = gimple_assign_rhs3 (stmt);
2762 }
2763 break;
2764
2765 case CALL_EXPR:
2766 op = gimple_call_arg (stmt, op_num);
2767 break;
2768
2769 case LSHIFT_EXPR:
2770 case RSHIFT_EXPR:
2771 case LROTATE_EXPR:
2772 case RROTATE_EXPR:
2773 op = gimple_op (stmt, op_num + 1);
2774 		  /* Unlike the other binary operators, shifts/rotates take
2775 		     an int shift count instead of one of the same type as
2776 		     the lhs, so make sure the scalar is the right type if
2777 		     we are dealing with vectors of
2778 		     long long/long/short/char.  */
2779 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2780 op = fold_convert (TREE_TYPE (vector_type), op);
2781 break;
2782
2783 default:
2784 op = gimple_op (stmt, op_num + 1);
2785 break;
2786 }
2787 }
2788
2789 if (reduc_index != -1)
2790 {
2791 loop = (gimple_bb (stmt))->loop_father;
2792 def_stmt = SSA_NAME_DEF_STMT (op);
2793
2794 gcc_assert (loop);
2795
2796 	      /* Get the def before the loop.  In a reduction chain we have
2797 		 only one initial value.  */
2798 if ((j != (number_of_copies - 1)
2799 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2800 && i != 0))
2801 && neutral_op)
2802 op = neutral_op;
2803 else
2804 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2805 loop_preheader_edge (loop));
2806 }
2807
2808 /* Create 'vect_ = {op0,op1,...,opn}'. */
2809 number_of_places_left_in_vector--;
2810 tree orig_op = op;
2811 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2812 {
2813 if (CONSTANT_CLASS_P (op))
2814 {
2815 op = fold_unary (VIEW_CONVERT_EXPR,
2816 TREE_TYPE (vector_type), op);
2817 gcc_assert (op && CONSTANT_CLASS_P (op));
2818 }
2819 else
2820 {
2821 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2822 gimple init_stmt;
2823 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2824 init_stmt
2825 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2826 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2827 op = new_temp;
2828 }
2829 }
2830 elts[number_of_places_left_in_vector] = op;
2831 if (!CONSTANT_CLASS_P (op))
2832 constant_p = false;
2833 if (TREE_CODE (orig_op) == SSA_NAME
2834 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2835 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2836 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2837 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2838 place_after_defs = true;
2839
2840 if (number_of_places_left_in_vector == 0)
2841 {
2842 number_of_places_left_in_vector = nunits;
2843
2844 if (constant_p)
2845 vec_cst = build_vector (vector_type, elts);
2846 else
2847 {
2848 vec<constructor_elt, va_gc> *v;
2849 unsigned k;
2850 vec_alloc (v, nunits);
2851 for (k = 0; k < nunits; ++k)
2852 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2853 vec_cst = build_constructor (vector_type, v);
2854 }
2855 tree init;
2856 gimple_stmt_iterator gsi;
2857 if (place_after_defs)
2858 {
2859 gsi = gsi_for_stmt
2860 (vect_find_last_scalar_stmt_in_slp (slp_node));
2861 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2862 }
2863 else
2864 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2865 if (ctor_seq != NULL)
2866 {
2867 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2868 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2869 GSI_SAME_STMT);
2870 ctor_seq = NULL;
2871 }
2872 voprnds.quick_push (init);
2873 place_after_defs = false;
2874 }
2875 }
2876 }
2877
2878 /* Since the vectors are created in the reverse order, we should invert
2879 them. */
2880 vec_num = voprnds.length ();
2881 for (j = vec_num; j != 0; j--)
2882 {
2883 vop = voprnds[j - 1];
2884 vec_oprnds->quick_push (vop);
2885 }
2886
2887 voprnds.release ();
2888
2889   /* When VF is greater than the unrolling factor needed for the SLP
2890      group of stmts, the NUMBER_OF_VECTORS to be created is greater than
2891      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2892      to replicate the vectors.  NEUTRAL_VEC is hoisted so it is built once.  */
2893   tree neutral_vec = NULL;
2894   while (number_of_vectors > vec_oprnds->length ())
2895     {
2896
2897 if (neutral_op)
2898 {
2899 if (!neutral_vec)
2900 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2901
2902 vec_oprnds->quick_push (neutral_vec);
2903 }
2904 else
2905 {
2906 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2907 vec_oprnds->quick_push (vop);
2908 }
2909 }
2910 }
2911
2912
2913 /* Get vectorized definitions from SLP_NODE that contains corresponding
2914 vectorized def-stmts. */
2915
2916 static void
2917 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2918 {
2919 tree vec_oprnd;
2920 gimple vec_def_stmt;
2921 unsigned int i;
2922
2923 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2924
2925 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2926 {
2927 gcc_assert (vec_def_stmt);
2928 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2929 vec_oprnds->quick_push (vec_oprnd);
2930 }
2931 }
2932
2933
2934 /* Get vectorized definitions for SLP_NODE.
2935 If the scalar definitions are loop invariants or constants, collect them and
2936 call vect_get_constant_vectors() to create vector stmts.
2937 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2938 must be stored in the corresponding child of SLP_NODE, and we call
2939 vect_get_slp_vect_defs () to retrieve them. */
2940
2941 void
2942 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2943 vec<vec<tree> > *vec_oprnds, int reduc_index)
2944 {
2945 gimple first_stmt;
2946 int number_of_vects = 0, i;
2947 unsigned int child_index = 0;
2948 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2949 slp_tree child = NULL;
2950 vec<tree> vec_defs;
2951 tree oprnd;
2952 bool vectorized_defs;
2953
2954 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2955 FOR_EACH_VEC_ELT (ops, i, oprnd)
2956 {
2957 /* For each operand we check if it has vectorized definitions in a child
2958 node or we need to create them (for invariants and constants). We
2959 check if the LHS of the first stmt of the next child matches OPRND.
2960 	 If it does, we found the correct child.  Otherwise, we call
2961 	 vect_get_constant_vectors () and do not advance CHILD_INDEX, in
2962 	 order to check this child node for the next operand.  */
2963 vectorized_defs = false;
2964 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2965 {
2966 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2967
2968 /* We have to check both pattern and original def, if available. */
2969 if (child)
2970 {
2971 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2972 gimple related
2973 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2974
2975 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2976 || (related
2977 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2978 {
2979 /* The number of vector defs is determined by the number of
2980 vector statements in the node from which we get those
2981 statements. */
2982 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2983 vectorized_defs = true;
2984 child_index++;
2985 }
2986 }
2987 else
2988 child_index++;
2989 }
2990
2991 if (!vectorized_defs)
2992 {
2993 if (i == 0)
2994 {
2995 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2996 /* Number of vector stmts was calculated according to LHS in
2997 vect_schedule_slp_instance (), fix it by replacing LHS with
2998 RHS, if necessary. See vect_get_smallest_scalar_type () for
2999 details. */
3000 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3001 &rhs_size_unit);
3002 if (rhs_size_unit != lhs_size_unit)
3003 {
3004 number_of_vects *= rhs_size_unit;
3005 number_of_vects /= lhs_size_unit;
3006 }
3007 }
3008 }
3009
3010 /* Allocate memory for vectorized defs. */
3011 vec_defs = vNULL;
3012 vec_defs.create (number_of_vects);
3013
3014 /* For reduction defs we call vect_get_constant_vectors (), since we are
3015 looking for initial loop invariant values. */
3016 if (vectorized_defs && reduc_index == -1)
3017 /* The defs are already vectorized. */
3018 vect_get_slp_vect_defs (child, &vec_defs);
3019 else
3020 /* Build vectors from scalar defs. */
3021 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3022 number_of_vects, reduc_index);
3023
3024 vec_oprnds->quick_push (vec_defs);
3025
3026 /* For reductions, we only need initial values. */
3027 if (reduc_index != -1)
3028 return;
3029 }
3030 }
3031
3032
3033 /* Create NCOPIES permutation statements using the vector mask MASK (a
3034    vector of type MASK_TYPE, already built by the caller) and two input
3035    vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the
3036    first copy, shifting by STRIDE elements of DR_CHAIN for every copy.
3037 (STRIDE is the number of vectorized stmts for NODE divided by the number of
3038 copies).
3039 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
3040 the created stmts must be inserted. */
3041
3042 static inline void
3043 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
3044 tree mask, int first_vec_indx, int second_vec_indx,
3045 gimple_stmt_iterator *gsi, slp_tree node,
3046 tree vectype, vec<tree> dr_chain,
3047 int ncopies, int vect_stmts_counter)
3048 {
3049 tree perm_dest;
3050 gimple perm_stmt = NULL;
3051 stmt_vec_info next_stmt_info;
3052 int i, stride;
3053 tree first_vec, second_vec, data_ref;
3054
3055 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3056
3057 /* Initialize the vect stmts of NODE to properly insert the generated
3058 stmts later. */
3059 for (i = SLP_TREE_VEC_STMTS (node).length ();
3060 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3061 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3062
3063 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3064 for (i = 0; i < ncopies; i++)
3065 {
3066 first_vec = dr_chain[first_vec_indx];
3067 second_vec = dr_chain[second_vec_indx];
3068
3069 /* Generate the permute statement. */
3070 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3071 first_vec, second_vec, mask);
3072 data_ref = make_ssa_name (perm_dest, perm_stmt);
3073 gimple_set_lhs (perm_stmt, data_ref);
3074 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3075
3076 /* Store the vector statement in NODE. */
3077 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3078
3079 first_vec_indx += stride;
3080 second_vec_indx += stride;
3081 }
3082
3083 /* Mark the scalar stmt as vectorized. */
3084 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
3085 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
3086 }
3087
3088
3089 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
3090 return in CURRENT_MASK_ELEMENT its equivalent in target specific
3091 representation. Check that the mask is valid and return FALSE if not.
3092    Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
3093    the next vector, i.e., the current first vector is not needed.  */
3094
3095 static bool
3096 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
3097 int mask_nunits, bool only_one_vec, int index,
3098 unsigned char *mask, int *current_mask_element,
3099 bool *need_next_vector, int *number_of_mask_fixes,
3100 bool *mask_fixed, bool *needs_first_vector)
3101 {
3102 int i;
3103
3104 /* Convert to target specific representation. */
3105 *current_mask_element = first_mask_element + m;
3106 /* Adjust the value in case it's a mask for second and third vectors. */
3107 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3108
3109 if (*current_mask_element < 0)
3110 {
3111 if (dump_enabled_p ())
3112 {
3113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3114 "permutation requires past vector ");
3115 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3116 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3117 }
3118 return false;
3119 }
3120
3121 if (*current_mask_element < mask_nunits)
3122 *needs_first_vector = true;
3123
3124 /* We have only one input vector to permute but the mask accesses values in
3125 the next vector as well. */
3126 if (only_one_vec && *current_mask_element >= mask_nunits)
3127 {
3128 if (dump_enabled_p ())
3129 {
3130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3131 "permutation requires at least two vectors ");
3132 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3133 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3134 }
3135
3136 return false;
3137 }
3138
3139 /* The mask requires the next vector. */
3140 while (*current_mask_element >= mask_nunits * 2)
3141 {
3142 if (*needs_first_vector || *mask_fixed)
3143 {
3144 /* We either need the first vector too or have already moved to the
3145 next vector. In both cases, this permutation needs three
3146 vectors. */
3147 if (dump_enabled_p ())
3148 {
3149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3150 "permutation requires at "
3151 "least three vectors ");
3152 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3153 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3154 }
3155
3156 return false;
3157 }
3158
3159 /* We move to the next vector, dropping the first one and working with
3160 the second and the third - we need to adjust the values of the mask
3161 accordingly. */
3162 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3163
3164 for (i = 0; i < index; i++)
3165 mask[i] -= mask_nunits * *number_of_mask_fixes;
3166
3167 (*number_of_mask_fixes)++;
3168 *mask_fixed = true;
3169 }
3170
3171 *need_next_vector = *mask_fixed;
3172
3173 /* This was the last element of this mask. Start a new one. */
3174 if (index == mask_nunits - 1)
3175 {
3176 *number_of_mask_fixes = 1;
3177 *mask_fixed = false;
3178 *needs_first_vector = false;
3179 }
3180
3181 return true;
3182 }
3183
3184
3185 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3186 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3187 permute statements for the SLP node NODE of the SLP instance
3188 SLP_NODE_INSTANCE. */
3189
3190 bool
3191 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3192 gimple_stmt_iterator *gsi, int vf,
3193 slp_instance slp_node_instance, bool analyze_only)
3194 {
3195 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3196 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3197 tree mask_element_type = NULL_TREE, mask_type;
3198 int i, j, k, nunits, vec_index = 0, scalar_index;
3199 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3200 gimple next_scalar_stmt;
3201 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3202 int first_mask_element;
3203 int index, unroll_factor, current_mask_element, ncopies;
3204 unsigned char *mask;
3205 bool only_one_vec = false, need_next_vector = false;
3206 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3207 int number_of_mask_fixes = 1;
3208 bool mask_fixed = false;
3209 bool needs_first_vector = false;
3210 machine_mode mode;
3211
3212 mode = TYPE_MODE (vectype);
3213
3214 if (!can_vec_perm_p (mode, false, NULL))
3215 {
3216 if (dump_enabled_p ())
3217 {
3218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3219 "no vect permute for ");
3220 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3221 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3222 }
3223 return false;
3224 }
3225
3226 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3227 same size as the vector element being permuted. */
3228 mask_element_type = lang_hooks.types.type_for_mode
3229 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3230 mask_type = get_vectype_for_scalar_type (mask_element_type);
3231 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3232 mask = XALLOCAVEC (unsigned char, nunits);
3233 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3234
3235   /* The number of vector stmts to generate based only on the
3236      SLP_NODE_INSTANCE unrolling factor.  */
3237 orig_vec_stmts_num = group_size *
3238 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3239 if (orig_vec_stmts_num == 1)
3240 only_one_vec = true;
3241
3242   /* The number of copies is determined by the final vectorization factor
3243      relative to the SLP_NODE_INSTANCE unrolling factor.  */
3244 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
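  /* Worked example (illustrative numbers): GROUP_SIZE = 3, an instance
     unrolling factor of 4 and NUNITS = 4 give ORIG_VEC_STMTS_NUM
     = 3 * 4 / 4 = 3, so more than one input vector exists, and a final
     VF of 8 gives NCOPIES = 8 / 4 = 2.  */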
3245
3246 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3247 return false;
3248
3249 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3250
3251 /* Generate permutation masks for every NODE. Number of masks for each NODE
3252 is equal to GROUP_SIZE.
3253 E.g., we have a group of three nodes with three loads from the same
3254      location in each node, and the vector size is 4.  I.e., we have an
3255 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3256 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3257 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3258 ...
3259
3260 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3261 The last mask is illegal since we assume two operands for permute
3262 operation, and the mask element values can't be outside that range.
3263 Hence, the last mask must be converted into {2,5,5,5}.
3264 For the first two permutations we need the first and the second input
3265 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3266 we need the second and the third vectors: {b1,c1,a2,b2} and
3267 {c2,a3,b3,c3}. */
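  /* Sketch of the conversion mentioned above: dropping the first vector
     re-bases the mask by one full vector of NUNITS == 4 elements,

       unsigned char m[4] = { 6, 9, 9, 9 };
       for (int l = 0; l < 4; ++l)
	 m[l] -= 4;			   now { 2, 5, 5, 5 }

     keeping every element inside the legal [0, 2*NUNITS) range of a
     two-input permute; vect_get_mask_element does this adjustment.  */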
3268
3269 {
3270 scalar_index = 0;
3271 index = 0;
3272 vect_stmts_counter = 0;
3273 vec_index = 0;
3274 first_vec_index = vec_index++;
3275 if (only_one_vec)
3276 second_vec_index = first_vec_index;
3277 else
3278 second_vec_index = vec_index++;
3279
3280 for (j = 0; j < unroll_factor; j++)
3281 {
3282 for (k = 0; k < group_size; k++)
3283 {
3284 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3285 first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
3286 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3287 nunits, only_one_vec, index,
3288 mask, &current_mask_element,
3289 &need_next_vector,
3290 &number_of_mask_fixes, &mask_fixed,
3291 &needs_first_vector))
3292 return false;
3293 gcc_assert (current_mask_element >= 0
3294 && current_mask_element < 2 * nunits);
3295 mask[index++] = current_mask_element;
3296
3297 if (index == nunits)
3298 {
3299 index = 0;
3300 if (!can_vec_perm_p (mode, false, mask))
3301 {
3302 if (dump_enabled_p ())
3303 {
3304 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3305 vect_location,
3306 "unsupported vect permute { ");
3307 for (i = 0; i < nunits; ++i)
3308 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3309 mask[i]);
3310 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3311 }
3312 return false;
3313 }
3314
3315 if (!analyze_only)
3316 {
3317 int l;
3318 tree mask_vec, *mask_elts;
3319 mask_elts = XALLOCAVEC (tree, nunits);
3320 for (l = 0; l < nunits; ++l)
3321 mask_elts[l] = build_int_cst (mask_element_type,
3322 mask[l]);
3323 mask_vec = build_vector (mask_type, mask_elts);
3324
3325 if (need_next_vector)
3326 {
3327 first_vec_index = second_vec_index;
3328 second_vec_index = vec_index;
3329 }
3330
3331 next_scalar_stmt
3332 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3333
3334 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3335 mask_vec, first_vec_index, second_vec_index,
3336 gsi, node, vectype, dr_chain,
3337 ncopies, vect_stmts_counter++);
3338 }
3339 }
3340 }
3341 }
3342 }
3343
3344 return true;
3345 }
3346
3347
3348
3349 /* Vectorize SLP instance tree in postorder. */
3350
3351 static bool
3352 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3353 unsigned int vectorization_factor)
3354 {
3355 gimple stmt;
3356 bool grouped_store, is_store;
3357 gimple_stmt_iterator si;
3358 stmt_vec_info stmt_info;
3359 unsigned int vec_stmts_size, nunits, group_size;
3360 tree vectype;
3361 int i;
3362 slp_tree child;
3363
3364 if (!node)
3365 return false;
3366
3367 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3368 vect_schedule_slp_instance (child, instance, vectorization_factor);
3369
3370 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3371 stmt_info = vinfo_for_stmt (stmt);
3372
3373 /* VECTYPE is the type of the destination. */
3374 vectype = STMT_VINFO_VECTYPE (stmt_info);
3375 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3376 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3377
3378   /* For each SLP instance calculate the number of vector stmts to be
3379      created for the scalar stmts in each node of the SLP tree.  The number
3380      of vector elements in one vector iteration is the number of scalar
3381      elements in one scalar iteration (GROUP_SIZE) multiplied by VF divided
3382      by the vector size.
3383      Unless this is an SLP reduction, in which case the number of vector
3384      stmts is equal to the number of vector stmts of the children.  */
3385 if (GROUP_FIRST_ELEMENT (stmt_info)
3386 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3387 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3388 else
3389 vec_stmts_size = (vectorization_factor * group_size) / nunits;
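  /* E.g. (illustrative numbers) VECTORIZATION_FACTOR = 4, GROUP_SIZE = 2
     and NUNITS = 4 give (4 * 2) / 4 = 2 vector stmts for this node.  */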
3390
3391 if (!SLP_TREE_VEC_STMTS (node).exists ())
3392 {
3393 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3394 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3395 }
3396
3397 if (dump_enabled_p ())
3398 {
3399 dump_printf_loc (MSG_NOTE,vect_location,
3400 "------>vectorizing SLP node starting from: ");
3401 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3402 dump_printf (MSG_NOTE, "\n");
3403 }
3404
3405 /* Vectorized stmts go before the last scalar stmt which is where
3406 all uses are ready. */
3407 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3408
3409 /* Mark the first element of the reduction chain as reduction to properly
3410 transform the node. In the analysis phase only the last element of the
3411 chain is marked as reduction. */
3412 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3413 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3414 {
3415 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3416 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3417 }
3418
3419 /* Handle two-operation SLP nodes by vectorizing the group with
3420 both operations and then performing a merge. */
3421 if (SLP_TREE_TWO_OPERATORS (node))
3422 {
3423 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3424       enum tree_code ocode = ERROR_MARK;
3425 gimple ostmt;
3426 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3427 bool allsame = true;
3428 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3429 if (gimple_assign_rhs_code (ostmt) != code0)
3430 {
3431 mask[i] = 1;
3432 allsame = false;
3433 ocode = gimple_assign_rhs_code (ostmt);
3434 }
3435 else
3436 mask[i] = 0;
3437 if (!allsame)
3438 {
3439 vec<gimple> v0;
3440 vec<gimple> v1;
3441 unsigned j;
3442 tree tmask = NULL_TREE;
3443 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3444 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3445 SLP_TREE_VEC_STMTS (node).truncate (0);
3446 gimple_assign_set_rhs_code (stmt, ocode);
3447 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3448 gimple_assign_set_rhs_code (stmt, code0);
3449 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3450 SLP_TREE_VEC_STMTS (node).truncate (0);
3451 tree meltype = build_nonstandard_integer_type
3452 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3453 tree mvectype = get_same_sized_vectype (meltype, vectype);
3454 unsigned k = 0, l;
3455 for (j = 0; j < v0.length (); ++j)
3456 {
3457 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3458 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3459 {
3460 if (k >= group_size)
3461 k = 0;
3462 melts[l] = build_int_cst
3463 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3464 }
3465 tmask = build_vector (mvectype, melts);
3466
3467 /* ??? Not all targets support a VEC_PERM_EXPR with a
3468 constant mask that would translate to a vec_merge RTX
3469 (with their vec_perm_const_ok). We can either not
3470 vectorize in that case or let veclower do its job.
3471 Unfortunately that isn't too great and at least for
3472 plus/minus we'd eventually like to match targets
3473 vector addsub instructions. */
3474 gimple vstmt;
3475 vstmt = gimple_build_assign (make_ssa_name (vectype),
3476 VEC_PERM_EXPR,
3477 gimple_assign_lhs (v0[j]),
3478 gimple_assign_lhs (v1[j]), tmask);
3479 vect_finish_stmt_generation (stmt, vstmt, &si);
3480 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3481 }
3482 v0.release ();
3483 v1.release ();
3484 return false;
3485 }
3486 }
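  /* Illustrative example (hypothetical input, not from this file): for a
     group such as

       a[2*i]   = b[2*i]   + c[2*i];
       a[2*i+1] = b[2*i+1] - c[2*i+1];

     the code above computes whole-vector PLUS and MINUS results and
     blends them with a VEC_PERM_EXPR whose mask takes even lanes from
     the PLUS vectors and odd lanes from the MINUS vectors, which is the
     form targets could eventually match as an addsub instruction.  */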
3487 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3488 return is_store;
3489 }
3490
3491 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3492 For loop vectorization this is done in vectorizable_call, but for SLP
3493 it needs to be deferred until end of vect_schedule_slp, because multiple
3494 SLP instances may refer to the same scalar stmt. */
3495
3496 static void
3497 vect_remove_slp_scalar_calls (slp_tree node)
3498 {
3499 gimple stmt, new_stmt;
3500 gimple_stmt_iterator gsi;
3501 int i;
3502 slp_tree child;
3503 tree lhs;
3504 stmt_vec_info stmt_info;
3505
3506 if (!node)
3507 return;
3508
3509 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3510 vect_remove_slp_scalar_calls (child);
3511
3512 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3513 {
3514 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3515 continue;
3516 stmt_info = vinfo_for_stmt (stmt);
3517 if (stmt_info == NULL
3518 || is_pattern_stmt_p (stmt_info)
3519 || !PURE_SLP_STMT (stmt_info))
3520 continue;
3521 lhs = gimple_call_lhs (stmt);
3522 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3523 set_vinfo_for_stmt (new_stmt, stmt_info);
3524 set_vinfo_for_stmt (stmt, NULL);
3525 STMT_VINFO_STMT (stmt_info) = new_stmt;
3526 gsi = gsi_for_stmt (stmt);
3527 gsi_replace (&gsi, new_stmt, false);
3528 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3529 }
3530 }
3531
3532 /* Generate vector code for all SLP instances in the loop/basic block. */
3533
3534 bool
3535 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3536 {
3537 vec<slp_instance> slp_instances;
3538 slp_instance instance;
3539 unsigned int i, vf;
3540 bool is_store = false;
3541
3542 if (loop_vinfo)
3543 {
3544 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3545 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3546 }
3547 else
3548 {
3549 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3550 vf = 1;
3551 }
3552
3553 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3554 {
3555 /* Schedule the tree of INSTANCE. */
3556 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3557 instance, vf);
3558 if (dump_enabled_p ())
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "vectorizing stmts using SLP.\n");
3561 }
3562
3563 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3564 {
3565 slp_tree root = SLP_INSTANCE_TREE (instance);
3566 gimple store;
3567 unsigned int j;
3568 gimple_stmt_iterator gsi;
3569
3570 /* Remove scalar call stmts. Do not do this for basic-block
3571 vectorization as not all uses may be vectorized.
3572 ??? Why should this be necessary? DCE should be able to
3573 remove the stmts itself.
3574 ??? For BB vectorization we can as well remove scalar
3575 stmts starting from the SLP tree root if they have no
3576 uses. */
3577 if (loop_vinfo)
3578 vect_remove_slp_scalar_calls (root);
3579
3580 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3581 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3582 {
3583 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3584 break;
3585
3586 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3587 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3588 /* Free the attached stmt_vec_info and remove the stmt. */
3589 gsi = gsi_for_stmt (store);
3590 unlink_stmt_vdef (store);
3591 gsi_remove (&gsi, true);
3592 release_defs (store);
3593 free_stmt_vec_info (store);
3594 }
3595 }
3596
3597 return is_store;
3598 }
3599
3600
3601 /* Vectorize the basic block. */
3602
3603 void
3604 vect_slp_transform_bb (basic_block bb)
3605 {
3606 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3607 gimple_stmt_iterator si;
3608
3609 gcc_assert (bb_vinfo);
3610
3611 if (dump_enabled_p ())
3612 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3613
3614 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3615 {
3616 gimple stmt = gsi_stmt (si);
3617 stmt_vec_info stmt_info;
3618
3619 if (dump_enabled_p ())
3620 {
3621 dump_printf_loc (MSG_NOTE, vect_location,
3622 "------>SLPing statement: ");
3623 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3624 dump_printf (MSG_NOTE, "\n");
3625 }
3626
3627 stmt_info = vinfo_for_stmt (stmt);
3628 gcc_assert (stmt_info);
3629
3630 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3631 if (STMT_SLP_TYPE (stmt_info))
3632 {
3633 vect_schedule_slp (NULL, bb_vinfo);
3634 break;
3635 }
3636 }
3637
3638 if (dump_enabled_p ())
3639 dump_printf_loc (MSG_NOTE, vect_location,
3640 "BASIC BLOCK VECTORIZED\n");
3641
3642 destroy_bb_vec_info (bb_vinfo);
3643 }