/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "alias.h"
#include "symtab.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "rtl.h"
#include "flags.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the location of the first stmt in BB that has one, and
   UNKNOWN_LOCATION otherwise.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;

  return node;
}
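
/* A note on the operand counting above (an illustrative sketch, not
   from the original sources): a plain binary assignment such as
   x_1 = a_2 + b_3 gets two child slots.  For a COND_EXPR right-hand
   side like

       x_1 = a_2 < b_3 ? c_4 : d_5

   the embedded comparison contributes two operands of its own, so
   NOPS is bumped to four to cover a_2, b_3, c_4 and d_5 separately.  */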


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}
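
/* For instance (an illustrative sketch, not from the original
   sources): in a chain of consecutive loads from a[0], a[1] and a[2],
   each element after the first carries a GROUP_GAP of 1, so the loads
   get places 0, 1 and 2.  If a[1] were never read, the chain a[0],
   a[2] would record a gap of 2 on its second element and the load of
   a[2] would get place 2.  */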


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping operands of the operation, and 0
   if everything is ok.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}
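
/* A sketch of the swapping logic above (illustrative only, not from
   the original sources): given

       x0 = y0 + a[0];
       x1 = a[1] + y1;

   with y0/y1 defined outside the analyzed region and a[] loaded
   inside it, operand zero of the two stmts has different def types.
   Since PLUS_EXPR is commutative the second stmt is re-analyzed with
   its operands read in swapped order, and on success
   swap_ssa_operands applies the swap to the GIMPLE IL as well.  */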


/* Check whether the scalar stmts STMTS are isomorphic and of a kind
   SLP can handle.  Return true if they are, otherwise return false
   and indicate in *MATCHES which stmts are not isomorphic to the
   first one.  If MATCHES[0] is false then this indicates the
   comparison could not be carried out or the stmts will never be
   vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  gimple first_stmt = stmts[0], stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* If populating the vector type requires unrolling then fail
         before adjusting *max_nunits for basic-block vectorization.  */
      if (bb_vinfo
          && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unrolling required "
                           "in basic block SLP\n");
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
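  /* An illustrative sketch (not from the original sources): for a
     group alternating PLUS_EXPR and MINUS_EXPR on a four-element
     vector type, both whole-vector operations are emitted and the
     selector built below becomes { 0, 5, 2, 7 }, blending the even
     lanes of the add result with the odd lanes of the sub result.  */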
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
        = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
        {
          sel[i] = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
        }
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Recursively build an SLP tree starting from NODE.  Return false
   (recording in MATCHES which scalar stmts mismatched) if the
   def-stmts are not isomorphic, require data permutation or are of
   unsupported types of operation; otherwise return true.  */

static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  bool two_operators = false;
  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches,
                              &two_operators))
    return false;
  SLP_TREE_TWO_OPERATORS (*node) = two_operators;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, i, &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build it up this node from scalars.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ())
            {
              unsigned int j;
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (grandchild != NULL)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  *max_nunits = old_max_nunits;
                  loads->truncate (old_nloads);
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  vect_free_slp_tree (child);
                  SLP_TREE_CHILDREN (*node).quick_push (NULL);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ???  This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (bb_vinfo
          && !matches[0]
          /* ???  Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          SLP_TREE_CHILDREN (*node).quick_push (NULL);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ???  For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && !SLP_TREE_TWO_OPERATORS (*node)
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ... */
          bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   tem, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  */
              for (j = 0; j < group_size; ++j)
                if (!matches[j])
                  {
                    gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                  }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}
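
/* An illustrative sketch of the result (not from the original
   sources): for the store group

       t0 = b[0] * x;  a[0] = t0;
       t1 = b[1] * x;  a[1] = t1;

   the root node holds the two stores and gets one child for the two
   multiplications; that child in turn gets a leaf child for the
   grouped loads of b[0] and b[1] (pushed onto LOADS, terminating the
   recursion), while the invariant x is an external def and produces
   no child node of its own.  */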

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
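
/* For example (illustrative only): with GROUP_SIZE 2 and PERMUTATION
   { 1, 0 }, the stmt at index 0 moves to slot 1 and vice versa, in
   this node and recursively in all of its children.  */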


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  sbitmap load_index;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    SLP_TREE_LOAD_PERMUTATION (node).release ();
  return true;
}
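
/* A sketch of the idea (illustrative only, not from the original
   sources): for a pair of reductions like

       sum0 += a[2 * i + 1];
       sum1 += a[2 * i];

   every load node carries the permutation { 1, 0 }.  Because the
   order of reduction stmts is irrelevant, the routine above reorders
   the scalar stmts of the whole instance to match the load order and
   then drops the permutation entirely.  */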

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ???  Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (vect_attempt_slp_rearrange_stmts (slp_instn))
        return true;

      /* Fallthru to general load permutation handling.  */
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ???  This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}
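
/* Subchain example (illustrative only): if an interleaving chain
   loads a[0], a[1], a[2], a[3] and a basic-block SLP node needs just
   a[2] followed by a[3], the two loads form a contiguous subchain, so
   the recorded permutation can simply be released instead of being
   materialized as a shuffle.  */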


/* Find the last scalar stmt of SLP node NODE (for the root node of an
   SLP instance this is the last store).  */

static gimple
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple last = NULL, stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost)
{
  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ???  Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
        {
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                            stmt_info, 0, vect_body);
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
                            stmt_info, 0, vect_body);
        }
    }

  /* Scan operands and account for prologue cost of constants/externals.
     ???  This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, STMT_VINFO_LOOP_VINFO (stmt_info),
                              STMT_VINFO_BB_VINFO (stmt_info),
                              &def_stmt, &def, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ???  We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Adjust the group_size by the vectorization factor which is always one
     for basic-block vectorization.  */
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
        {
        case DOT_PROD_EXPR:
        case SAD_EXPR:
          nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
                      (TREE_TYPE (gimple_assign_rhs1 (stmt))));
          break;
        default:;
        }
    }
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
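  /* E.g. (an illustrative sketch): with a group of 8 scalars and
     4-element vectors this yields 2 copies per node; with a group of
     2 and 4-element vectors it yields 1 vector stmt whose lanes are
     only partly used.  */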

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, &body_cost_vec,
                           ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
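  /* For instance (an illustrative sketch): a store group of 2 with
     4-element vectors gives an unrolling factor of 2, i.e. two copies
     of the group are needed to fill one vector; a group of 4 with
     4-element vectors needs no unrolling.  */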
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
      /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, matches, &npermutes, NULL,
                           max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_LOADS (new_instance) = loads;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted
              /* The load requires permutation when unrolling exposes
                 a gap either because the group is larger than the SLP
                 group-size or because there is a gap between the groups.  */
              && (unrolling_factor == 1
                  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
                      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }
        }

      if (loop_vinfo)
        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}


/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                  unsigned max_tree_size)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                   max_tree_size))
      ok = true;

  if (reduc_chains.length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                       max_tree_size))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (reductions.length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
                                    max_tree_size))
    ok = true;

  return true;
}
1916
1917
1918 /* For each possible SLP instance decide whether to SLP it and calculate the
1919 overall unrolling factor needed to SLP the loop. Return TRUE if decided to
1920 SLP at least one instance. */
1921
1922 bool
1923 vect_make_slp_decision (loop_vec_info loop_vinfo)
1924 {
1925 unsigned int i, unrolling_factor = 1;
1926 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1927 slp_instance instance;
1928 int decided_to_slp = 0;
1929
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1932 "\n");
1933
1934 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1935 {
1936 /* FORNOW: SLP if you can. */
1937 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1938 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1939
1940 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1941 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1942 loop-based vectorization. Such stmts will be marked as HYBRID. */
1943 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1944 decided_to_slp++;
1945 }
1946
1947 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1948
1949 if (decided_to_slp && dump_enabled_p ())
1950 dump_printf_loc (MSG_NOTE, vect_location,
1951 "Decided to SLP %d instances. Unrolling factor %d\n",
1952 decided_to_slp, unrolling_factor);
1953
1954 return (decided_to_slp > 0);
1955 }
1956
1957
1958 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1959 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1960
1961 static void
1962 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1963 {
1964 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1965 imm_use_iterator imm_iter;
1966 gimple use_stmt;
1967 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1968 slp_tree child;
1969 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1970 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 int j;
1972
1973 /* Propagate hybrid down the SLP tree. */
1974 if (stype == hybrid)
1975 ;
1976 else if (HYBRID_SLP_STMT (stmt_vinfo))
1977 stype = hybrid;
1978 else
1979 {
1980 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
1981 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1982 /* We always get the pattern stmt here, but for immediate
1983 uses we have to use the LHS of the original stmt. */
1984 gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1985 if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
1986 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1987 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1988 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1989 {
1990 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1991 continue;
1992 use_vinfo = vinfo_for_stmt (use_stmt);
1993 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1994 && STMT_VINFO_RELATED_STMT (use_vinfo))
1995 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
1996 if (!STMT_SLP_TYPE (use_vinfo)
1997 && (STMT_VINFO_RELEVANT (use_vinfo)
1998 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
1999 && !(gimple_code (use_stmt) == GIMPLE_PHI
2000 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2001 stype = hybrid;
2002 }
2003 }
2004
2005 if (stype == hybrid)
2006 {
2007 if (dump_enabled_p ())
2008 {
2009 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2010 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2011 }
2012 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2013 }
2014
2015 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2016 if (child)
2017 vect_detect_hybrid_slp_stmts (child, i, stype);
2018 }
2019
2020 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2021
2022 static tree
2023 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2024 {
2025 walk_stmt_info *wi = (walk_stmt_info *)data;
2026 struct loop *loopp = (struct loop *)wi->info;
2027
2028 if (wi->is_lhs)
2029 return NULL_TREE;
2030
2031 if (TREE_CODE (*tp) == SSA_NAME
2032 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2033 {
2034 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
2035 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2036 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2037 {
2038 if (dump_enabled_p ())
2039 {
2040 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2041 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2042 }
2043 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2044 }
2045 }
2046
2047 return NULL_TREE;
2048 }
2049
2050 static tree
2051 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2052 walk_stmt_info *)
2053 {
2054 /* If the stmt is in a SLP instance then this isn't a reason
2055 to mark use definitions in other SLP instances as hybrid. */
2056 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2057 *handled = true;
2058 return NULL_TREE;
2059 }
2060
2061 /* Find stmts that must be both vectorized and SLPed. */
2062
2063 void
2064 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2065 {
2066 unsigned int i;
2067 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2068 slp_instance instance;
2069
2070 if (dump_enabled_p ())
2071 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2072 "\n");
2073
2074 /* First walk all pattern stmts in the loop and mark defs of uses as
2075 hybrid, because immediate uses in pattern stmts are not recorded. */
2076 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2077 {
2078 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2079 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2080 gsi_next (&gsi))
2081 {
2082 gimple stmt = gsi_stmt (gsi);
2083 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2084 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2085 {
2086 walk_stmt_info wi;
2087 memset (&wi, 0, sizeof (wi));
2088 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2089 gimple_stmt_iterator gsi2
2090 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2091 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2092 vect_detect_hybrid_slp_1, &wi);
2093 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2094 vect_detect_hybrid_slp_2,
2095 vect_detect_hybrid_slp_1, &wi);
2096 }
2097 }
2098 }
2099
2100 /* Then walk the SLP instance trees marking stmts with uses in
2101 non-SLP stmts as hybrid, also propagating hybrid down the
2102 SLP tree, collecting the above info on-the-fly. */
2103 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2104 {
2105 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2106 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2107 i, pure_slp);
2108 }
2109 }
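
/* Illustrative standalone sketch (hypothetical toy code, not GCC
   internals): the hybrid marking above in miniature.  A pure-SLP
   stmt whose def is also consumed by a loop-vectorized (non-SLP)
   stmt becomes hybrid, and the mark propagates down to the whole
   subtree feeding it, mirroring vect_detect_hybrid_slp_stmts.  */

#include <stdbool.h>

enum toy_slp_type { TOY_LOOP_VECT, TOY_PURE_SLP, TOY_HYBRID };

struct toy_node
{
  enum toy_slp_type type;
  bool has_non_slp_use;         /* def used by a non-SLP stmt  */
  struct toy_node **children;
  unsigned n_children;
};

void
toy_detect_hybrid (struct toy_node *n, bool parent_hybrid)
{
  bool hybrid = parent_hybrid
		|| n->type == TOY_HYBRID
		|| (n->type == TOY_PURE_SLP && n->has_non_slp_use);
  if (hybrid)
    n->type = TOY_HYBRID;
  for (unsigned i = 0; i < n->n_children; i++)
    toy_detect_hybrid (n->children[i], hybrid);
}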
2110
2111
2112 /* Create and initialize a new bb_vec_info struct for BB, as well as
2113 stmt_vec_info structs for all the stmts in it. */
2114
2115 static bb_vec_info
2116 new_bb_vec_info (basic_block bb)
2117 {
2118 bb_vec_info res = NULL;
2119 gimple_stmt_iterator gsi;
2120
2121 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2122 BB_VINFO_BB (res) = bb;
2123
2124 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2125 {
2126 gimple stmt = gsi_stmt (gsi);
2127 gimple_set_uid (stmt, 0);
2128 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2129 }
2130
2131 BB_VINFO_GROUPED_STORES (res).create (10);
2132 BB_VINFO_SLP_INSTANCES (res).create (2);
2133 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2134
2135 bb->aux = res;
2136 return res;
2137 }
2138
2139
2140 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2141 stmts in the basic block. */
2142
2143 static void
2144 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2145 {
2146 vec<slp_instance> slp_instances;
2147 slp_instance instance;
2148 basic_block bb;
2149 gimple_stmt_iterator si;
2150 unsigned i;
2151
2152 if (!bb_vinfo)
2153 return;
2154
2155 bb = BB_VINFO_BB (bb_vinfo);
2156
2157 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2158 {
2159 gimple stmt = gsi_stmt (si);
2160 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2161
2162 if (stmt_info)
2163 /* Free stmt_vec_info. */
2164 free_stmt_vec_info (stmt);
2165 }
2166
2167 vect_destroy_datarefs (NULL, bb_vinfo);
2168 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2169 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2170 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2171 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2172 vect_free_slp_instance (instance);
2173 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2174 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2175 free (bb_vinfo);
2176 bb->aux = NULL;
2177 }
2178
2179
2180 /* Analyze statements contained in SLP tree node after recursively analyzing
2181 the subtree. Return TRUE if the operations are supported. */
2182
2183 static bool
2184 vect_slp_analyze_node_operations (slp_tree node)
2185 {
2186 bool dummy;
2187 int i;
2188 gimple stmt;
2189 slp_tree child;
2190
2191 if (!node)
2192 return true;
2193
2194 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2195 if (!vect_slp_analyze_node_operations (child))
2196 return false;
2197
2198 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2199 {
2200 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2201 gcc_assert (stmt_info);
2202 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2203
2204 if (!vect_analyze_stmt (stmt, &dummy, node))
2205 return false;
2206 }
2207
2208 return true;
2209 }
2210
2211
2212 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2213 operations are supported. */
2214
2215 bool
2216 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2217 {
2218 slp_instance instance;
2219 int i;
2220
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_NOTE, vect_location,
2223 "=== vect_slp_analyze_operations ===\n");
2224
2225 for (i = 0; slp_instances.iterate (i, &instance); )
2226 {
2227 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2228 {
2229 dump_printf_loc (MSG_NOTE, vect_location,
2230 "removing SLP instance operations starting from: ");
2231 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2232 SLP_TREE_SCALAR_STMTS
2233 (SLP_INSTANCE_TREE (instance))[0], 0);
2234 vect_free_slp_instance (instance);
2235 slp_instances.ordered_remove (i);
2236 }
2237 else
2238 {
2239 /* Compute the costs of the SLP instance. */
2240 vect_analyze_slp_cost (instance, data);
2241 i++;
2242 }
2243 }
2244
2245 if (!slp_instances.length ())
2246 return false;
2247
2248 return true;
2249 }
2250
2251
2252 /* Compute the scalar cost of the SLP node NODE and its children
2253 and return it. Do not account defs that are marked in LIFE and
2254 update LIFE according to uses of NODE. */
2255
2256 static unsigned
2257 vect_bb_slp_scalar_cost (basic_block bb,
2258 slp_tree node, vec<bool, va_heap> *life)
2259 {
2260 unsigned scalar_cost = 0;
2261 unsigned i;
2262 gimple stmt;
2263 slp_tree child;
2264
2265 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2266 {
2267 unsigned stmt_cost;
2268 ssa_op_iter op_iter;
2269 def_operand_p def_p;
2270 stmt_vec_info stmt_info;
2271
2272 if ((*life)[i])
2273 continue;
2274
2275 /* If there is a non-vectorized use of the defs then the scalar
2276 stmt is kept live in which case we do not account it or any
2277 required defs in the SLP children in the scalar cost. This
2278 way we make the vectorization more costly when compared to
2279 the scalar cost. */
2280 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2281 {
2282 imm_use_iterator use_iter;
2283 gimple use_stmt;
2284 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2285 if (!is_gimple_debug (use_stmt)
2286 && (gimple_code (use_stmt) == GIMPLE_PHI
2287 || gimple_bb (use_stmt) != bb
2288 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2289 {
2290 (*life)[i] = true;
2291 BREAK_FROM_IMM_USE_STMT (use_iter);
2292 }
2293 }
2294 if ((*life)[i])
2295 continue;
2296
2297 stmt_info = vinfo_for_stmt (stmt);
2298 if (STMT_VINFO_DATA_REF (stmt_info))
2299 {
2300 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2301 stmt_cost = vect_get_stmt_cost (scalar_load);
2302 else
2303 stmt_cost = vect_get_stmt_cost (scalar_store);
2304 }
2305 else
2306 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2307
2308 scalar_cost += stmt_cost;
2309 }
2310
2311 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2312 if (child)
2313 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2314
2315 return scalar_cost;
2316 }
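
/* Illustrative standalone sketch (hypothetical toy code): the
   liveness rule applied above.  A scalar stmt whose def is still
   used outside the vectorized block stays live, so it contributes
   nothing to the scalar cost, deliberately making vectorization
   look more expensive by comparison.  */

#include <stdbool.h>

struct toy_scalar_stmt
{
  unsigned cost;
  bool def_used_outside_bb;   /* keeps the scalar stmt live  */
};

unsigned
toy_bb_scalar_cost (const struct toy_scalar_stmt *stmts, unsigned n)
{
  unsigned cost = 0;
  for (unsigned i = 0; i < n; i++)
    if (!stmts[i].def_used_outside_bb)
      cost += stmts[i].cost;
  return cost;
}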
2317
2318 /* Check if vectorization of the basic block is profitable. */
2319
2320 static bool
2321 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2322 {
2323 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2324 slp_instance instance;
2325 int i;
2326 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2327 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2328
2329 /* Calculate scalar cost. */
2330 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2331 {
2332 auto_vec<bool, 20> life;
2333 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2334 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2335 SLP_INSTANCE_TREE (instance),
2336 &life);
2337 }
2338
2339 /* Complete the target-specific cost calculation. */
2340 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2341 &vec_inside_cost, &vec_epilogue_cost);
2342
2343 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2344
2345 if (dump_enabled_p ())
2346 {
2347 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2348 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2349 vec_inside_cost);
2350 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2351 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2352 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2353 }
2354
2355 /* Vectorization is profitable if its cost is less than the cost of the
2356 scalar version. */
2357 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2358 return false;
2359
2360 return true;
2361 }
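
/* Illustrative standalone sketch (hypothetical toy code): the
   profitability test above in isolation.  Vectorization wins only
   if prologue + body + epilogue cost is strictly below the scalar
   cost, e.g. 2 + 6 + 2 = 10 < 12.  */

#include <stdbool.h>

bool
toy_bb_profitable_p (unsigned vec_prologue, unsigned vec_inside,
		     unsigned vec_epilogue, unsigned scalar_cost)
{
  return vec_prologue + vec_inside + vec_epilogue < scalar_cost;
}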
2362
2363 /* Check if the basic block can be vectorized. */
2364
2365 static bb_vec_info
2366 vect_slp_analyze_bb_1 (basic_block bb)
2367 {
2368 bb_vec_info bb_vinfo;
2369 vec<slp_instance> slp_instances;
2370 slp_instance instance;
2371 int i;
2372 int min_vf = 2;
2373 unsigned n_stmts = 0;
2374
2375 bb_vinfo = new_bb_vec_info (bb);
2376 if (!bb_vinfo)
2377 return NULL;
2378
2379 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2380 {
2381 if (dump_enabled_p ())
2382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383 "not vectorized: unhandled data-ref in basic "
2384 "block.\n");
2385
2386 destroy_bb_vec_info (bb_vinfo);
2387 return NULL;
2388 }
2389
2390 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2391 {
2392 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2394 "not vectorized: not enough data-refs in "
2395 "basic block.\n");
2396
2397 destroy_bb_vec_info (bb_vinfo);
2398 return NULL;
2399 }
2400
2401 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2402 {
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "not vectorized: unhandled data access in "
2406 "basic block.\n");
2407
2408 destroy_bb_vec_info (bb_vinfo);
2409 return NULL;
2410 }
2411
2412 vect_pattern_recog (NULL, bb_vinfo);
2413
2414 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2415 {
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2418 "not vectorized: bad data alignment in basic "
2419 "block.\n");
2420
2421 destroy_bb_vec_info (bb_vinfo);
2422 return NULL;
2423 }
2424
2425 /* Check the SLP opportunities in the basic block, analyze and build SLP
2426 trees. */
2427 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2428 {
2429 if (dump_enabled_p ())
2430 {
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "Failed to SLP the basic block.\n");
2433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2434 "not vectorized: failed to find SLP opportunities "
2435 "in basic block.\n");
2436 }
2437
2438 destroy_bb_vec_info (bb_vinfo);
2439 return NULL;
2440 }
2441
2442 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2443
2444 /* Mark all the statements that we want to vectorize as pure SLP and
2445 relevant. */
2446 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2447 {
2448 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2449 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2450 }
2451
2452 /* Mark all the statements that we do not want to vectorize. */
2453 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2454 !gsi_end_p (gsi); gsi_next (&gsi))
2455 {
2456 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2457 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2458 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2459 }
2460
2461 /* Analyze dependences. At this point all stmts not participating in
2462 vectorization have to be marked. Dependence analysis assumes
2463 that we either vectorize all SLP instances or none at all. */
2464 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2465 {
2466 if (dump_enabled_p ())
2467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468 "not vectorized: unhandled data dependence "
2469 "in basic block.\n");
2470
2471 destroy_bb_vec_info (bb_vinfo);
2472 return NULL;
2473 }
2474
2475 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2476 {
2477 if (dump_enabled_p ())
2478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2479 "not vectorized: unsupported alignment in basic "
2480 "block.\n");
2481 destroy_bb_vec_info (bb_vinfo);
2482 return NULL;
2483 }
2484
2485 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2486 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2487 {
2488 if (dump_enabled_p ())
2489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2490 "not vectorized: bad operation in basic block.\n");
2491
2492 destroy_bb_vec_info (bb_vinfo);
2493 return NULL;
2494 }
2495
2496 /* Cost model: check if the vectorization is worthwhile. */
2497 if (!unlimited_cost_model (NULL)
2498 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2499 {
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2502 "not vectorized: vectorization is not "
2503 "profitable.\n");
2504
2505 destroy_bb_vec_info (bb_vinfo);
2506 return NULL;
2507 }
2508
2509 if (dump_enabled_p ())
2510 dump_printf_loc (MSG_NOTE, vect_location,
2511 "Basic block will be vectorized using SLP\n");
2512
2513 return bb_vinfo;
2514 }
2515
2516
2517 bb_vec_info
2518 vect_slp_analyze_bb (basic_block bb)
2519 {
2520 bb_vec_info bb_vinfo;
2521 int insns = 0;
2522 gimple_stmt_iterator gsi;
2523 unsigned int vector_sizes;
2524
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2527
2528 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2529 {
2530 gimple stmt = gsi_stmt (gsi);
2531 if (!is_gimple_debug (stmt)
2532 && !gimple_nop_p (stmt)
2533 && gimple_code (stmt) != GIMPLE_LABEL)
2534 insns++;
2535 }
2536
2537 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2538 {
2539 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2541 "not vectorized: too many instructions in "
2542 "basic block.\n");
2543
2544 return NULL;
2545 }
2546
2547 /* Autodetect the first vector size we try. */
2548 current_vector_size = 0;
2549 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2550
2551 while (1)
2552 {
2553 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2554 if (bb_vinfo)
2555 return bb_vinfo;
2556
2557 destroy_bb_vec_info (bb_vinfo);
2558
2559 vector_sizes &= ~current_vector_size;
2560 if (vector_sizes == 0
2561 || current_vector_size == 0)
2562 return NULL;
2563
2564 /* Try the next biggest vector size. */
2565 current_vector_size = 1 << floor_log2 (vector_sizes);
2566 if (dump_enabled_p ())
2567 dump_printf_loc (MSG_NOTE, vect_location,
2568 "***** Re-trying analysis with "
2569 "vector size %d\n", current_vector_size);
2570 }
2571 }
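
/* Illustrative standalone sketch (hypothetical toy code): the retry
   loop above walks the target's supported-sizes bitmask from the
   largest size downwards; clearing the size that failed and taking
   1 << floor_log2 of what remains selects the next biggest
   candidate.  */

#include <stdio.h>

static unsigned
toy_floor_log2 (unsigned x)
{
  unsigned l = 0;
  while (x >>= 1)
    l++;
  return l;
}

int
main (void)
{
  unsigned vector_sizes = 16 | 32;   /* e.g. 16- and 32-byte vectors  */
  unsigned current = 1u << toy_floor_log2 (vector_sizes);
  while (1)
    {
      printf ("analyzing with vector size %u\n", current);
      /* Assume the analysis fails and we retry with a smaller size.  */
      vector_sizes &= ~current;
      if (vector_sizes == 0)
	break;
      current = 1u << toy_floor_log2 (vector_sizes);
    }
  return 0;   /* tries 32, then 16  */
}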
2572
2573
2574 /* For constant and loop invariant defs of SLP_NODE this function returns
2575 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2576 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2577 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2578 REDUC_INDEX is the index of the reduction operand in the statements, unless
2579 it is -1. */
2580
2581 static void
2582 vect_get_constant_vectors (tree op, slp_tree slp_node,
2583 vec<tree> *vec_oprnds,
2584 unsigned int op_num, unsigned int number_of_vectors,
2585 int reduc_index)
2586 {
2587 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2588 gimple stmt = stmts[0];
2589 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2590 unsigned nunits;
2591 tree vec_cst;
2592 tree *elts;
2593 unsigned j, number_of_places_left_in_vector;
2594 tree vector_type;
2595 tree vop;
2596 int group_size = stmts.length ();
2597 unsigned int vec_num, i;
2598 unsigned number_of_copies = 1;
2599 vec<tree> voprnds;
2600 voprnds.create (number_of_vectors);
2601 bool constant_p, is_store;
2602 tree neutral_op = NULL;
2603 enum tree_code code = gimple_expr_code (stmt);
2604 gimple def_stmt;
2605 struct loop *loop;
2606 gimple_seq ctor_seq = NULL;
2607
2608 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2609 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2610
2611 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2612 && reduc_index != -1)
2613 {
2614 op_num = reduc_index;
2615 op = gimple_op (stmt, op_num + 1);
2616 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2617 we need either neutral operands or the original operands. See
2618 get_initial_def_for_reduction() for details. */
2619 switch (code)
2620 {
2621 case WIDEN_SUM_EXPR:
2622 case DOT_PROD_EXPR:
2623 case SAD_EXPR:
2624 case PLUS_EXPR:
2625 case MINUS_EXPR:
2626 case BIT_IOR_EXPR:
2627 case BIT_XOR_EXPR:
2628 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2629 neutral_op = build_real (TREE_TYPE (op), dconst0);
2630 else
2631 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2632
2633 break;
2634
2635 case MULT_EXPR:
2636 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2637 neutral_op = build_real (TREE_TYPE (op), dconst1);
2638 else
2639 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2640
2641 break;
2642
2643 case BIT_AND_EXPR:
2644 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2645 break;
2646
2647 /* For MIN/MAX we don't have an easy neutral operand but
2648 the initial values can be used fine here. Only for
2649 a reduction chain we have to force a neutral element. */
2650 case MAX_EXPR:
2651 case MIN_EXPR:
2652 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2653 neutral_op = NULL;
2654 else
2655 {
2656 def_stmt = SSA_NAME_DEF_STMT (op);
2657 loop = (gimple_bb (stmt))->loop_father;
2658 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2659 loop_preheader_edge (loop));
2660 }
2661 break;
2662
2663 default:
2664 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2665 neutral_op = NULL;
2666 }
2667 }
2668
2669 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2670 {
2671 is_store = true;
2672 op = gimple_assign_rhs1 (stmt);
2673 }
2674 else
2675 is_store = false;
2676
2677 gcc_assert (op);
2678
2679 if (CONSTANT_CLASS_P (op))
2680 constant_p = true;
2681 else
2682 constant_p = false;
2683
2684 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2685 created vectors. It is greater than 1 if unrolling is performed.
2686
2687 For example, we have two scalar operands, s1 and s2 (e.g., group of
2688 strided accesses of size two), while NUNITS is four (i.e., four scalars
2689 of this type can be packed in a vector). The output vector will contain
2690 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2691 will be 2).
2692
2693 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2694 containing the operands.
2695
2696 For example, NUNITS is four as before, and the group size is 8
2697 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2698 {s5, s6, s7, s8}. */
2699
2700 number_of_copies = nunits * number_of_vectors / group_size;
2701
2702 number_of_places_left_in_vector = nunits;
2703 elts = XALLOCAVEC (tree, nunits);
2704 bool place_after_defs = false;
2705 for (j = 0; j < number_of_copies; j++)
2706 {
2707 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2708 {
2709 if (is_store)
2710 op = gimple_assign_rhs1 (stmt);
2711 else
2712 {
2713 switch (code)
2714 {
2715 case COND_EXPR:
2716 if (op_num == 0 || op_num == 1)
2717 {
2718 tree cond = gimple_assign_rhs1 (stmt);
2719 op = TREE_OPERAND (cond, op_num);
2720 }
2721 else
2722 {
2723 if (op_num == 2)
2724 op = gimple_assign_rhs2 (stmt);
2725 else
2726 op = gimple_assign_rhs3 (stmt);
2727 }
2728 break;
2729
2730 case CALL_EXPR:
2731 op = gimple_call_arg (stmt, op_num);
2732 break;
2733
2734 case LSHIFT_EXPR:
2735 case RSHIFT_EXPR:
2736 case LROTATE_EXPR:
2737 case RROTATE_EXPR:
2738 op = gimple_op (stmt, op_num + 1);
2739 /* Unlike the other binary operators, shifts/rotates take an
2740 int shift count instead of a count with the same type as
2741 the lhs, so make sure the scalar is the right type if
2742 we are dealing with vectors of
2743 long long/long/short/char. */
2744 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2745 op = fold_convert (TREE_TYPE (vector_type), op);
2746 break;
2747
2748 default:
2749 op = gimple_op (stmt, op_num + 1);
2750 break;
2751 }
2752 }
2753
2754 if (reduc_index != -1)
2755 {
2756 loop = (gimple_bb (stmt))->loop_father;
2757 def_stmt = SSA_NAME_DEF_STMT (op);
2758
2759 gcc_assert (loop);
2760
2761 /* Get the def before the loop. In a reduction chain we have only
2762 one initial value. */
2763 if ((j != (number_of_copies - 1)
2764 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2765 && i != 0))
2766 && neutral_op)
2767 op = neutral_op;
2768 else
2769 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2770 loop_preheader_edge (loop));
2771 }
2772
2773 /* Create 'vect_ = {op0,op1,...,opn}'. */
2774 number_of_places_left_in_vector--;
2775 tree orig_op = op;
2776 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2777 {
2778 if (CONSTANT_CLASS_P (op))
2779 {
2780 op = fold_unary (VIEW_CONVERT_EXPR,
2781 TREE_TYPE (vector_type), op);
2782 gcc_assert (op && CONSTANT_CLASS_P (op));
2783 }
2784 else
2785 {
2786 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2787 gimple init_stmt;
2788 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2789 init_stmt
2790 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2791 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2792 op = new_temp;
2793 }
2794 }
2795 elts[number_of_places_left_in_vector] = op;
2796 if (!CONSTANT_CLASS_P (op))
2797 constant_p = false;
2798 if (TREE_CODE (orig_op) == SSA_NAME
2799 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2800 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2801 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2802 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2803 place_after_defs = true;
2804
2805 if (number_of_places_left_in_vector == 0)
2806 {
2807 number_of_places_left_in_vector = nunits;
2808
2809 if (constant_p)
2810 vec_cst = build_vector (vector_type, elts);
2811 else
2812 {
2813 vec<constructor_elt, va_gc> *v;
2814 unsigned k;
2815 vec_alloc (v, nunits);
2816 for (k = 0; k < nunits; ++k)
2817 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2818 vec_cst = build_constructor (vector_type, v);
2819 }
2820 tree init;
2821 gimple_stmt_iterator gsi;
2822 if (place_after_defs)
2823 {
2824 gsi = gsi_for_stmt
2825 (vect_find_last_scalar_stmt_in_slp (slp_node));
2826 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2827 }
2828 else
2829 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2830 if (ctor_seq != NULL)
2831 {
2832 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2833 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2834 GSI_SAME_STMT);
2835 ctor_seq = NULL;
2836 }
2837 voprnds.quick_push (init);
2838 place_after_defs = false;
2839 }
2840 }
2841 }
2842
2843 /* Since the vectors are created in reverse order, we have to reverse
2844 them here. */
2845 vec_num = voprnds.length ();
2846 for (j = vec_num; j != 0; j--)
2847 {
2848 vop = voprnds[j - 1];
2849 vec_oprnds->quick_push (vop);
2850 }
2851
2852 voprnds.release ();
2853
2854 /* If VF is greater than the unrolling factor needed for the SLP
2855 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2856 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2857 to replicate the vectors. */
2858 while (number_of_vectors > vec_oprnds->length ())
2859 {
2860 tree neutral_vec = NULL;
2861
2862 if (neutral_op)
2863 {
2864 if (!neutral_vec)
2865 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2866
2867 vec_oprnds->quick_push (neutral_vec);
2868 }
2869 else
2870 {
2871 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2872 vec_oprnds->quick_push (vop);
2873 }
2874 }
2875 }
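
/* Illustrative standalone sketch (hypothetical toy code): the
   NUMBER_OF_COPIES arithmetic above.  With nunits = 4, one output
   vector and a group of 2 scalars, number_of_copies is 2 and the
   vector holds {s1, s2, s1, s2}.  (The real code fills vectors back
   to front and reverses the result; the sketch fills forward for
   clarity.)  */

#include <stdio.h>

int
main (void)
{
  unsigned nunits = 4, number_of_vectors = 1, group_size = 2;
  int group[2] = { 10, 20 };   /* s1, s2  */
  unsigned number_of_copies = nunits * number_of_vectors / group_size;
  int vect[4];
  unsigned k = 0;
  for (unsigned j = 0; j < number_of_copies; j++)
    for (unsigned i = 0; i < group_size; i++)
      vect[k++] = group[i];
  for (k = 0; k < nunits; k++)
    printf ("%d ", vect[k]);   /* prints "10 20 10 20"  */
  printf ("\n");
  return 0;
}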
2876
2877
2878 /* Get vectorized definitions from SLP_NODE that contains corresponding
2879 vectorized def-stmts. */
2880
2881 static void
2882 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2883 {
2884 tree vec_oprnd;
2885 gimple vec_def_stmt;
2886 unsigned int i;
2887
2888 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2889
2890 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2891 {
2892 gcc_assert (vec_def_stmt);
2893 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2894 vec_oprnds->quick_push (vec_oprnd);
2895 }
2896 }
2897
2898
2899 /* Get vectorized definitions for SLP_NODE.
2900 If the scalar definitions are loop invariants or constants, collect them and
2901 call vect_get_constant_vectors() to create vector stmts.
2902 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2903 must be stored in the corresponding child of SLP_NODE, and we call
2904 vect_get_slp_vect_defs () to retrieve them. */
2905
2906 void
2907 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2908 vec<vec<tree> > *vec_oprnds, int reduc_index)
2909 {
2910 gimple first_stmt;
2911 int number_of_vects = 0, i;
2912 unsigned int child_index = 0;
2913 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2914 slp_tree child = NULL;
2915 vec<tree> vec_defs;
2916 tree oprnd;
2917 bool vectorized_defs;
2918
2919 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2920 FOR_EACH_VEC_ELT (ops, i, oprnd)
2921 {
2922 /* For each operand we check if it has vectorized definitions in a child
2923 node or we need to create them (for invariants and constants). We
2924 check if the LHS of the first stmt of the next child matches OPRND.
2925 If it does, we found the correct child. Otherwise, we call
2926 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
2927 to check this child node for the next operand. */
2928 vectorized_defs = false;
2929 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2930 {
2931 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2932
2933 /* We have to check both pattern and original def, if available. */
2934 if (child)
2935 {
2936 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2937 gimple related
2938 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2939
2940 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2941 || (related
2942 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2943 {
2944 /* The number of vector defs is determined by the number of
2945 vector statements in the node from which we get those
2946 statements. */
2947 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2948 vectorized_defs = true;
2949 child_index++;
2950 }
2951 }
2952 else
2953 child_index++;
2954 }
2955
2956 if (!vectorized_defs)
2957 {
2958 if (i == 0)
2959 {
2960 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2961 /* Number of vector stmts was calculated according to LHS in
2962 vect_schedule_slp_instance (), fix it by replacing LHS with
2963 RHS, if necessary. See vect_get_smallest_scalar_type () for
2964 details. */
2965 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2966 &rhs_size_unit);
2967 if (rhs_size_unit != lhs_size_unit)
2968 {
2969 number_of_vects *= rhs_size_unit;
2970 number_of_vects /= lhs_size_unit;
2971 }
2972 }
2973 }
2974
2975 /* Allocate memory for vectorized defs. */
2976 vec_defs = vNULL;
2977 vec_defs.create (number_of_vects);
2978
2979 /* For reduction defs we call vect_get_constant_vectors (), since we are
2980 looking for initial loop invariant values. */
2981 if (vectorized_defs && reduc_index == -1)
2982 /* The defs are already vectorized. */
2983 vect_get_slp_vect_defs (child, &vec_defs);
2984 else
2985 /* Build vectors from scalar defs. */
2986 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2987 number_of_vects, reduc_index);
2988
2989 vec_oprnds->quick_push (vec_defs);
2990
2991 /* For reductions, we only need initial values. */
2992 if (reduc_index != -1)
2993 return;
2994 }
2995 }
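
/* Illustrative standalone sketch (hypothetical toy code): the
   LHS/RHS size fixup above.  The vector count is first computed
   from the LHS type; if the operand type has a different size, the
   count is rescaled by rhs_size_unit / lhs_size_unit, e.g. a count
   of 8 with a 4-byte LHS and a 1-byte RHS becomes 2.  */

int
toy_fixup_number_of_vects (int number_of_vects,
			   long lhs_size_unit, long rhs_size_unit)
{
  if (rhs_size_unit != lhs_size_unit)
    {
      number_of_vects *= rhs_size_unit;
      number_of_vects /= lhs_size_unit;
    }
  return number_of_vects;
}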
2996
2997
2998 /* Create NCOPIES permutation statements using the mask MASK (a vector of type
2999 MASK_TYPE built by the caller from the mask bytes) and two input vectors placed in
3000 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
3001 shifting by STRIDE elements of DR_CHAIN for every copy.
3002 (STRIDE is the number of vectorized stmts for NODE divided by the number of
3003 copies).
3004 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
3005 the created stmts must be inserted. */
3006
3007 static inline void
3008 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
3009 tree mask, int first_vec_indx, int second_vec_indx,
3010 gimple_stmt_iterator *gsi, slp_tree node,
3011 tree vectype, vec<tree> dr_chain,
3012 int ncopies, int vect_stmts_counter)
3013 {
3014 tree perm_dest;
3015 gimple perm_stmt = NULL;
3016 stmt_vec_info next_stmt_info;
3017 int i, stride;
3018 tree first_vec, second_vec, data_ref;
3019
3020 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3021
3022 /* Initialize the vect stmts of NODE to properly insert the generated
3023 stmts later. */
3024 for (i = SLP_TREE_VEC_STMTS (node).length ();
3025 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3026 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3027
3028 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3029 for (i = 0; i < ncopies; i++)
3030 {
3031 first_vec = dr_chain[first_vec_indx];
3032 second_vec = dr_chain[second_vec_indx];
3033
3034 /* Generate the permute statement. */
3035 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3036 first_vec, second_vec, mask);
3037 data_ref = make_ssa_name (perm_dest, perm_stmt);
3038 gimple_set_lhs (perm_stmt, data_ref);
3039 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3040
3041 /* Store the vector statement in NODE. */
3042 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3043
3044 first_vec_indx += stride;
3045 second_vec_indx += stride;
3046 }
3047
3048 /* Mark the scalar stmt as vectorized. */
3049 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
3050 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
3051 }
3052
3053
3054 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
3055 return in CURRENT_MASK_ELEMENT its equivalent in target-specific
3056 representation. Check that the mask is valid and return FALSE if not.
3057 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
3058 the next vector, i.e., the current first vector is not needed. */
3059
3060 static bool
3061 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
3062 int mask_nunits, bool only_one_vec, int index,
3063 unsigned char *mask, int *current_mask_element,
3064 bool *need_next_vector, int *number_of_mask_fixes,
3065 bool *mask_fixed, bool *needs_first_vector)
3066 {
3067 int i;
3068
3069 /* Convert to target specific representation. */
3070 *current_mask_element = first_mask_element + m;
3071 /* Adjust the value in case it's a mask for second and third vectors. */
3072 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3073
3074 if (*current_mask_element < 0)
3075 {
3076 if (dump_enabled_p ())
3077 {
3078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3079 "permutation requires past vector ");
3080 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3081 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3082 }
3083 return false;
3084 }
3085
3086 if (*current_mask_element < mask_nunits)
3087 *needs_first_vector = true;
3088
3089 /* We have only one input vector to permute but the mask accesses values in
3090 the next vector as well. */
3091 if (only_one_vec && *current_mask_element >= mask_nunits)
3092 {
3093 if (dump_enabled_p ())
3094 {
3095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3096 "permutation requires at least two vectors ");
3097 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3098 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3099 }
3100
3101 return false;
3102 }
3103
3104 /* The mask requires the next vector. */
3105 while (*current_mask_element >= mask_nunits * 2)
3106 {
3107 if (*needs_first_vector || *mask_fixed)
3108 {
3109 /* We either need the first vector too or have already moved to the
3110 next vector. In both cases, this permutation needs three
3111 vectors. */
3112 if (dump_enabled_p ())
3113 {
3114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3115 "permutation requires at "
3116 "least three vectors ");
3117 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3118 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3119 }
3120
3121 return false;
3122 }
3123
3124 /* We move to the next vector, dropping the first one and working with
3125 the second and the third - we need to adjust the values of the mask
3126 accordingly. */
3127 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3128
3129 for (i = 0; i < index; i++)
3130 mask[i] -= mask_nunits * *number_of_mask_fixes;
3131
3132 (*number_of_mask_fixes)++;
3133 *mask_fixed = true;
3134 }
3135
3136 *need_next_vector = *mask_fixed;
3137
3138 /* This was the last element of this mask. Start a new one. */
3139 if (index == mask_nunits - 1)
3140 {
3141 *number_of_mask_fixes = 1;
3142 *mask_fixed = false;
3143 *needs_first_vector = false;
3144 }
3145
3146 return true;
3147 }
3148
3149
3150 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3151 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3152 permute statements for the SLP node NODE of the SLP instance
3153 SLP_NODE_INSTANCE. */
3154
3155 bool
3156 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3157 gimple_stmt_iterator *gsi, int vf,
3158 slp_instance slp_node_instance, bool analyze_only)
3159 {
3160 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3161 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3162 tree mask_element_type = NULL_TREE, mask_type;
3163 int i, j, k, nunits, vec_index = 0, scalar_index;
3164 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3165 gimple next_scalar_stmt;
3166 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3167 int first_mask_element;
3168 int index, unroll_factor, current_mask_element, ncopies;
3169 unsigned char *mask;
3170 bool only_one_vec = false, need_next_vector = false;
3171 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3172 int number_of_mask_fixes = 1;
3173 bool mask_fixed = false;
3174 bool needs_first_vector = false;
3175 machine_mode mode;
3176
3177 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3178 return false;
3179
3180 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3181
3182 mode = TYPE_MODE (vectype);
3183
3184 if (!can_vec_perm_p (mode, false, NULL))
3185 {
3186 if (dump_enabled_p ())
3187 {
3188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189 "no vect permute for ");
3190 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3191 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3192 }
3193 return false;
3194 }
3195
3196 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3197 same size as the vector element being permuted. */
3198 mask_element_type = lang_hooks.types.type_for_mode
3199 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3200 mask_type = get_vectype_for_scalar_type (mask_element_type);
3201 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3202 mask = XALLOCAVEC (unsigned char, nunits);
3203 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3204
3205 /* The number of vector stmts to generate based only on the SLP_NODE_INSTANCE
3206 unrolling factor. */
3207 orig_vec_stmts_num
3208 = (STMT_VINFO_GROUP_SIZE (stmt_info)
3209 * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
3210 + nunits - 1) / nunits;
3211 if (orig_vec_stmts_num == 1)
3212 only_one_vec = true;
3213
3214 /* The number of copies is determined by the final vectorization factor
3215 relative to the SLP_NODE_INSTANCE unrolling factor. */
3216 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3217
3218 /* Generate permutation masks for every NODE. Number of masks for each NODE
3219 is equal to GROUP_SIZE.
3220 E.g., we have a group of three nodes with three loads from the same
3221 location in each node, and the vector size is 4. I.e., we have an
3222 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3223 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3224 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3225 ...
3226
3227 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3228 The last mask is illegal since we assume two operands for the permute
3229 operation, and the mask element values can't be outside that range.
3230 Hence, the last mask must be converted into {2,5,5,5}.
3231 For the first two permutations we need the first and the second input
3232 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3233 we need the second and the third vectors: {b1,c1,a2,b2} and
3234 {c2,a3,b3,c3}. */
3235
3236 {
3237 scalar_index = 0;
3238 index = 0;
3239 vect_stmts_counter = 0;
3240 vec_index = 0;
3241 first_vec_index = vec_index++;
3242 if (only_one_vec)
3243 second_vec_index = first_vec_index;
3244 else
3245 second_vec_index = vec_index++;
3246
3247 for (j = 0; j < unroll_factor; j++)
3248 {
3249 for (k = 0; k < group_size; k++)
3250 {
3251 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3252 first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
3253 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3254 nunits, only_one_vec, index,
3255 mask, &current_mask_element,
3256 &need_next_vector,
3257 &number_of_mask_fixes, &mask_fixed,
3258 &needs_first_vector))
3259 return false;
3260 gcc_assert (current_mask_element >= 0
3261 && current_mask_element < 2 * nunits);
3262 mask[index++] = current_mask_element;
3263
3264 if (index == nunits)
3265 {
3266 index = 0;
3267 if (!can_vec_perm_p (mode, false, mask))
3268 {
3269 if (dump_enabled_p ())
3270 {
3271 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3272 vect_location,
3273 "unsupported vect permute { ");
3274 for (i = 0; i < nunits; ++i)
3275 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3276 mask[i]);
3277 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3278 }
3279 return false;
3280 }
3281
3282 if (!analyze_only)
3283 {
3284 int l;
3285 tree mask_vec, *mask_elts;
3286 mask_elts = XALLOCAVEC (tree, nunits);
3287 for (l = 0; l < nunits; ++l)
3288 mask_elts[l] = build_int_cst (mask_element_type,
3289 mask[l]);
3290 mask_vec = build_vector (mask_type, mask_elts);
3291
3292 if (need_next_vector)
3293 {
3294 first_vec_index = second_vec_index;
3295 second_vec_index = vec_index;
3296 }
3297
3298 next_scalar_stmt
3299 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3300
3301 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3302 mask_vec, first_vec_index, second_vec_index,
3303 gsi, node, vectype, dr_chain,
3304 ncopies, vect_stmts_counter++);
3305 }
3306 }
3307 }
3308 }
3309 }
3310
3311 return true;
3312 }
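
/* Illustrative standalone sketch (hypothetical toy code): the
   two-operand element permute the masks above are built for.  Each
   mask value indexes the concatenation of two input vectors, so it
   must lie in [0, 2*nunits); the mask {0,0,0,3} from the running
   example picks a0 a0 a0 a1 out of {a0,b0,c0,a1}.  */

#include <stdio.h>

#define TOY_NUNITS 4

static void
toy_vec_perm (const int *first, const int *second,
	      const unsigned char *mask, int *out)
{
  for (int i = 0; i < TOY_NUNITS; i++)
    out[i] = mask[i] < TOY_NUNITS ? first[mask[i]]
				  : second[mask[i] - TOY_NUNITS];
}

int
main (void)
{
  int v0[TOY_NUNITS] = { 100, 200, 300, 101 };   /* a0 b0 c0 a1  */
  int v1[TOY_NUNITS] = { 201, 301, 102, 202 };   /* b1 c1 a2 b2  */
  unsigned char mask[TOY_NUNITS] = { 0, 0, 0, 3 };
  int out[TOY_NUNITS];
  toy_vec_perm (v0, v1, mask, out);
  for (int i = 0; i < TOY_NUNITS; i++)
    printf ("%d ", out[i]);   /* prints "100 100 100 101"  */
  printf ("\n");
  return 0;
}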
3313
3314
3315
3316 /* Vectorize SLP instance tree in postorder. */
3317
3318 static bool
3319 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3320 unsigned int vectorization_factor)
3321 {
3322 gimple stmt;
3323 bool grouped_store, is_store;
3324 gimple_stmt_iterator si;
3325 stmt_vec_info stmt_info;
3326 unsigned int vec_stmts_size, nunits, group_size;
3327 tree vectype;
3328 int i;
3329 slp_tree child;
3330
3331 if (!node)
3332 return false;
3333
3334 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3335 vect_schedule_slp_instance (child, instance, vectorization_factor);
3336
3337 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3338 stmt_info = vinfo_for_stmt (stmt);
3339
3340 /* VECTYPE is the type of the destination. */
3341 vectype = STMT_VINFO_VECTYPE (stmt_info);
3342 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3343 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3344
3345 /* For each SLP instance calculate number of vector stmts to be created
3346 for the scalar stmts in each node of the SLP tree. Number of vector
3347 elements in one vector iteration is the number of scalar elements in
3348 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3349 size.
3350 Unless this is a SLP reduction in which case the number of vector
3351 stmts is equal to the number of vector stmts of the children. */
3352 if (GROUP_FIRST_ELEMENT (stmt_info)
3353 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3354 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3355 else
3356 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3357
3358 if (!SLP_TREE_VEC_STMTS (node).exists ())
3359 {
3360 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3361 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3362 }
3363
3364 if (dump_enabled_p ())
3365 {
3366 dump_printf_loc (MSG_NOTE, vect_location,
3367 "------>vectorizing SLP node starting from: ");
3368 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3369 dump_printf (MSG_NOTE, "\n");
3370 }
3371
3372 /* Vectorized stmts go before the last scalar stmt which is where
3373 all uses are ready. */
3374 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3375
3376 /* Mark the first element of the reduction chain as reduction to properly
3377 transform the node. In the analysis phase only the last element of the
3378 chain is marked as reduction. */
3379 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3380 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3381 {
3382 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3383 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3384 }
3385
3386 /* Handle two-operation SLP nodes by vectorizing the group with
3387 both operations and then performing a merge. */
3388 if (SLP_TREE_TWO_OPERATORS (node))
3389 {
3390 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3391 enum tree_code ocode;
3392 gimple ostmt;
3393 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3394 bool allsame = true;
3395 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3396 if (gimple_assign_rhs_code (ostmt) != code0)
3397 {
3398 mask[i] = 1;
3399 allsame = false;
3400 ocode = gimple_assign_rhs_code (ostmt);
3401 }
3402 else
3403 mask[i] = 0;
3404 if (!allsame)
3405 {
3406 vec<gimple> v0;
3407 vec<gimple> v1;
3408 unsigned j;
3409 tree tmask = NULL_TREE;
3410 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3411 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3412 SLP_TREE_VEC_STMTS (node).truncate (0);
3413 gimple_assign_set_rhs_code (stmt, ocode);
3414 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3415 gimple_assign_set_rhs_code (stmt, code0);
3416 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3417 SLP_TREE_VEC_STMTS (node).truncate (0);
3418 tree meltype = build_nonstandard_integer_type
3419 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3420 tree mvectype = get_same_sized_vectype (meltype, vectype);
3421 unsigned k = 0, l;
3422 for (j = 0; j < v0.length (); ++j)
3423 {
3424 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3425 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3426 {
3427 if (k >= group_size)
3428 k = 0;
3429 melts[l] = build_int_cst
3430 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3431 }
3432 tmask = build_vector (mvectype, melts);
3433
3434 /* ??? Not all targets support a VEC_PERM_EXPR with a
3435 constant mask that would translate to a vec_merge RTX
3436 (with their vec_perm_const_ok). We can either not
3437 vectorize in that case or let veclower do its job.
3438 Unfortunately that isn't too great and at least for
3439 plus/minus we'd eventually like to match targets
3440 vector addsub instructions. */
3441 gimple vstmt;
3442 vstmt = gimple_build_assign (make_ssa_name (vectype),
3443 VEC_PERM_EXPR,
3444 gimple_assign_lhs (v0[j]),
3445 gimple_assign_lhs (v1[j]), tmask);
3446 vect_finish_stmt_generation (stmt, vstmt, &si);
3447 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3448 }
3449 v0.release ();
3450 v1.release ();
3451 return false;
3452 }
3453 }
3454 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3455 return is_store;
3456 }
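
/* Illustrative standalone sketch (hypothetical toy code, assuming
   group_size == nunits for brevity): the merge built above for
   two-operator nodes.  mask[k] is 0 or 1 per scalar stmt, and
   permute element l is mask[k] * nunits + l, so lane l is taken
   either from the code0 results (first vector) or from the
   alternate-opcode results (second vector) -- the addsub pattern.  */

#include <stdio.h>

#define TOY_NUNITS 4

int
main (void)
{
  int v0[TOY_NUNITS] = { 1, 2, 3, 4 };       /* e.g. all-PLUS results  */
  int v1[TOY_NUNITS] = { -1, -2, -3, -4 };   /* e.g. all-MINUS results  */
  unsigned char mask[TOY_NUNITS] = { 0, 1, 0, 1 };
  int merged[TOY_NUNITS];
  for (int l = 0; l < TOY_NUNITS; l++)
    {
      int melt = mask[l] * TOY_NUNITS + l;   /* permute mask element  */
      merged[l] = melt < TOY_NUNITS ? v0[melt] : v1[melt - TOY_NUNITS];
    }
  for (int l = 0; l < TOY_NUNITS; l++)
    printf ("%d ", merged[l]);   /* prints "1 -2 3 -4"  */
  printf ("\n");
  return 0;
}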
3457
3458 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
3459 For loop vectorization this is done in vectorizable_call, but for SLP
3460 it needs to be deferred until end of vect_schedule_slp, because multiple
3461 SLP instances may refer to the same scalar stmt. */
3462
3463 static void
3464 vect_remove_slp_scalar_calls (slp_tree node)
3465 {
3466 gimple stmt, new_stmt;
3467 gimple_stmt_iterator gsi;
3468 int i;
3469 slp_tree child;
3470 tree lhs;
3471 stmt_vec_info stmt_info;
3472
3473 if (!node)
3474 return;
3475
3476 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3477 vect_remove_slp_scalar_calls (child);
3478
3479 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3480 {
3481 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3482 continue;
3483 stmt_info = vinfo_for_stmt (stmt);
3484 if (stmt_info == NULL
3485 || is_pattern_stmt_p (stmt_info)
3486 || !PURE_SLP_STMT (stmt_info))
3487 continue;
3488 lhs = gimple_call_lhs (stmt);
3489 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3490 set_vinfo_for_stmt (new_stmt, stmt_info);
3491 set_vinfo_for_stmt (stmt, NULL);
3492 STMT_VINFO_STMT (stmt_info) = new_stmt;
3493 gsi = gsi_for_stmt (stmt);
3494 gsi_replace (&gsi, new_stmt, false);
3495 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3496 }
3497 }
3498
3499 /* Generate vector code for all SLP instances in the loop/basic block. */
3500
3501 bool
3502 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3503 {
3504 vec<slp_instance> slp_instances;
3505 slp_instance instance;
3506 unsigned int i, vf;
3507 bool is_store = false;
3508
3509 if (loop_vinfo)
3510 {
3511 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3512 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3513 }
3514 else
3515 {
3516 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3517 vf = 1;
3518 }
3519
3520 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3521 {
3522 /* Schedule the tree of INSTANCE. */
3523 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3524 instance, vf);
3525 if (dump_enabled_p ())
3526 dump_printf_loc (MSG_NOTE, vect_location,
3527 "vectorizing stmts using SLP.\n");
3528 }
3529
3530 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3531 {
3532 slp_tree root = SLP_INSTANCE_TREE (instance);
3533 gimple store;
3534 unsigned int j;
3535 gimple_stmt_iterator gsi;
3536
3537 /* Remove scalar call stmts. Do not do this for basic-block
3538 vectorization as not all uses may be vectorized.
3539 ??? Why should this be necessary? DCE should be able to
3540 remove the stmts itself.
3541 ??? For BB vectorization we can as well remove scalar
3542 stmts starting from the SLP tree root if they have no
3543 uses. */
3544 if (loop_vinfo)
3545 vect_remove_slp_scalar_calls (root);
3546
3547 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3548 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3549 {
3550 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3551 break;
3552
3553 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3554 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3555 /* Free the attached stmt_vec_info and remove the stmt. */
3556 gsi = gsi_for_stmt (store);
3557 unlink_stmt_vdef (store);
3558 gsi_remove (&gsi, true);
3559 release_defs (store);
3560 free_stmt_vec_info (store);
3561 }
3562 }
3563
3564 return is_store;
3565 }
3566
3567
3568 /* Vectorize the basic block. */
3569
3570 void
3571 vect_slp_transform_bb (basic_block bb)
3572 {
3573 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3574 gimple_stmt_iterator si;
3575
3576 gcc_assert (bb_vinfo);
3577
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3580
3581 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3582 {
3583 gimple stmt = gsi_stmt (si);
3584 stmt_vec_info stmt_info;
3585
3586 if (dump_enabled_p ())
3587 {
3588 dump_printf_loc (MSG_NOTE, vect_location,
3589 "------>SLPing statement: ");
3590 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3591 dump_printf (MSG_NOTE, "\n");
3592 }
3593
3594 stmt_info = vinfo_for_stmt (stmt);
3595 gcc_assert (stmt_info);
3596
3597 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3598 if (STMT_SLP_TYPE (stmt_info))
3599 {
3600 vect_schedule_slp (NULL, bb_vinfo);
3601 break;
3602 }
3603 }
3604
3605 if (dump_enabled_p ())
3606 dump_printf_loc (MSG_NOTE, vect_location,
3607 "BASIC BLOCK VECTORIZED\n");
3608
3609 destroy_bb_vec_info (bb_vinfo);
3610 }