re PR tree-optimization/65961 (ice in vect_is_simple_use_1 with -O3)
[gcc.git] gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "target.h"
40 #include "predict.h"
41 #include "hard-reg-set.h"
42 #include "function.h"
43 #include "basic-block.h"
44 #include "gimple-pretty-print.h"
45 #include "tree-ssa-alias.h"
46 #include "internal-fn.h"
47 #include "gimple-expr.h"
48 #include "is-a.h"
49 #include "gimple.h"
50 #include "gimple-iterator.h"
51 #include "gimple-ssa.h"
52 #include "tree-phinodes.h"
53 #include "ssa-iterators.h"
54 #include "stringpool.h"
55 #include "tree-ssanames.h"
56 #include "tree-pass.h"
57 #include "cfgloop.h"
58 #include "hashtab.h"
59 #include "rtl.h"
60 #include "flags.h"
61 #include "statistics.h"
62 #include "real.h"
63 #include "fixed-value.h"
64 #include "insn-config.h"
65 #include "expmed.h"
66 #include "dojump.h"
67 #include "explow.h"
68 #include "calls.h"
69 #include "emit-rtl.h"
70 #include "varasm.h"
71 #include "stmt.h"
72 #include "expr.h"
73 #include "recog.h" /* FIXME: for insn_data */
74 #include "insn-codes.h"
75 #include "optabs.h"
76 #include "tree-vectorizer.h"
77 #include "langhooks.h"
78 #include "gimple-walk.h"
79
 80 /* Extract the location of the basic block in the source code.
 81 Return the first statement location found, or UNKNOWN_LOCATION if none. */
82
83 source_location
84 find_bb_location (basic_block bb)
85 {
86 gimple stmt = NULL;
87 gimple_stmt_iterator si;
88
89 if (!bb)
90 return UNKNOWN_LOCATION;
91
92 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
93 {
94 stmt = gsi_stmt (si);
95 if (gimple_location (stmt) != UNKNOWN_LOCATION)
96 return gimple_location (stmt);
97 }
98
99 return UNKNOWN_LOCATION;
100 }
101
102
103 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
104
105 static void
106 vect_free_slp_tree (slp_tree node)
107 {
108 int i;
109 slp_tree child;
110
111 if (!node)
112 return;
113
114 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
115 vect_free_slp_tree (child);
116
117 SLP_TREE_CHILDREN (node).release ();
118 SLP_TREE_SCALAR_STMTS (node).release ();
119 SLP_TREE_VEC_STMTS (node).release ();
120 SLP_TREE_LOAD_PERMUTATION (node).release ();
121
122 free (node);
123 }
124
125
126 /* Free the memory allocated for the SLP instance. */
127
128 void
129 vect_free_slp_instance (slp_instance instance)
130 {
131 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
132 SLP_INSTANCE_LOADS (instance).release ();
133 free (instance);
134 }
135
136
137 /* Create an SLP node for SCALAR_STMTS. */
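/* An illustrative sketch of the operand counting below (hypothetical
   GIMPLE, not from any testcase): a call x = foo (a, b) gets two
   operand slots; an assignment x = a + b gets gimple_num_ops - 1 == 2;
   and x = a < b ? c : d (a COND_EXPR) gets one extra slot because the
   embedded comparison contributes two operands rather than one.  */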
138
139 static slp_tree
140 vect_create_new_slp_node (vec<gimple> scalar_stmts)
141 {
142 slp_tree node;
143 gimple stmt = scalar_stmts[0];
144 unsigned int nops;
145
146 if (is_gimple_call (stmt))
147 nops = gimple_call_num_args (stmt);
148 else if (is_gimple_assign (stmt))
149 {
150 nops = gimple_num_ops (stmt) - 1;
151 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
152 nops++;
153 }
154 else
155 return NULL;
156
157 node = XNEW (struct _slp_tree);
158 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
159 SLP_TREE_VEC_STMTS (node).create (0);
160 SLP_TREE_CHILDREN (node).create (nops);
161 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
162 SLP_TREE_TWO_OPERATORS (node) = false;
163
164 return node;
165 }
166
167
168 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
169 operand. */
170 static vec<slp_oprnd_info>
171 vect_create_oprnd_info (int nops, int group_size)
172 {
173 int i;
174 slp_oprnd_info oprnd_info;
175 vec<slp_oprnd_info> oprnds_info;
176
177 oprnds_info.create (nops);
178 for (i = 0; i < nops; i++)
179 {
180 oprnd_info = XNEW (struct _slp_oprnd_info);
181 oprnd_info->def_stmts.create (group_size);
182 oprnd_info->first_dt = vect_uninitialized_def;
183 oprnd_info->first_op_type = NULL_TREE;
184 oprnd_info->first_pattern = false;
185 oprnd_info->second_pattern = false;
186 oprnds_info.quick_push (oprnd_info);
187 }
188
189 return oprnds_info;
190 }
191
192
193 /* Free operands info. */
194
195 static void
196 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
197 {
198 int i;
199 slp_oprnd_info oprnd_info;
200
201 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
202 {
203 oprnd_info->def_stmts.release ();
204 XDELETE (oprnd_info);
205 }
206
207 oprnds_info.release ();
208 }
209
210
211 /* Find the place of the data-ref in STMT in the interleaving chain that starts
212 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
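/* For illustration (hypothetical loads, not from any testcase): in the
   interleaving chain a[0], a[1], a[3] the GROUP_GAP of a[1] is 1 and
   that of a[3] is 2 (one element is skipped), so the places returned
   for the three loads are 0, 1 and 3 respectively.  */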
213
214 static int
215 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
216 {
217 gimple next_stmt = first_stmt;
218 int result = 0;
219
220 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
221 return -1;
222
223 do
224 {
225 if (next_stmt == stmt)
226 return result;
227 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
228 if (next_stmt)
229 result += GROUP_GAP (vinfo_for_stmt (next_stmt));
230 }
231 while (next_stmt);
232
233 return -1;
234 }
235
236
237 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
238 they are of a valid type and that they match the defs of the first stmt of
 239 the SLP group (stored in OPRNDS_INFO). Return -1 on a fatal error,
 240 1 if the error could be corrected by swapping operands of the
 241 operation, and 0 if everything is ok. */
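/* A hypothetical example of the swapping handled below: if the first
   stmt of the group is x0 = y0 + z0 with y0 an internal def and z0
   external, while another stmt is x1 = z1 + y1 with the roles
   reversed, the mismatch on operand 0 can be repaired by swapping the
   operands of the commutative PLUS_EXPR and re-checking.  */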
242
243 static int
244 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
245 gimple stmt, unsigned stmt_num,
246 vec<slp_oprnd_info> *oprnds_info)
247 {
248 tree oprnd;
249 unsigned int i, number_of_oprnds;
250 tree def;
251 gimple def_stmt;
252 enum vect_def_type dt = vect_uninitialized_def;
253 struct loop *loop = NULL;
254 bool pattern = false;
255 slp_oprnd_info oprnd_info;
256 int first_op_idx = 1;
257 bool commutative = false;
258 bool first_op_cond = false;
259 bool first = stmt_num == 0;
260 bool second = stmt_num == 1;
261
262 if (loop_vinfo)
263 loop = LOOP_VINFO_LOOP (loop_vinfo);
264
265 if (is_gimple_call (stmt))
266 {
267 number_of_oprnds = gimple_call_num_args (stmt);
268 first_op_idx = 3;
269 }
270 else if (is_gimple_assign (stmt))
271 {
272 enum tree_code code = gimple_assign_rhs_code (stmt);
273 number_of_oprnds = gimple_num_ops (stmt) - 1;
 274 if (code == COND_EXPR)
275 {
276 first_op_cond = true;
277 commutative = true;
278 number_of_oprnds++;
279 }
280 else
281 commutative = commutative_tree_code (code);
282 }
283 else
284 return -1;
285
286 bool swapped = false;
287 for (i = 0; i < number_of_oprnds; i++)
288 {
289 again:
290 if (first_op_cond)
291 {
292 if (i == 0 || i == 1)
293 oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
294 swapped ? !i : i);
295 else
296 oprnd = gimple_op (stmt, first_op_idx + i - 1);
297 }
298 else
299 oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
300
301 oprnd_info = (*oprnds_info)[i];
302
303 if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
304 &def, &dt))
305 {
306 if (dump_enabled_p ())
307 {
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "Build SLP failed: can't analyze def for ");
310 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
311 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
312 }
313
314 return -1;
315 }
316
317 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
318 from the pattern. Check that all the stmts of the node are in the
319 pattern. */
320 if (def_stmt && gimple_bb (def_stmt)
321 && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
322 || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
323 && gimple_code (def_stmt) != GIMPLE_PHI))
324 && vinfo_for_stmt (def_stmt)
325 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
326 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
327 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
328 {
329 pattern = true;
330 if (!first && !oprnd_info->first_pattern
331 /* Allow different pattern state for the defs of the
332 first stmt in reduction chains. */
333 && (oprnd_info->first_dt != vect_reduction_def
334 || (!second && !oprnd_info->second_pattern)))
335 {
336 if (i == 0
337 && !swapped
338 && commutative)
339 {
340 swapped = true;
341 goto again;
342 }
343
344 if (dump_enabled_p ())
345 {
346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
347 "Build SLP failed: some of the stmts"
348 " are in a pattern, and others are not ");
349 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
350 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
351 }
352
353 return 1;
354 }
355
356 def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
357 dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
358
359 if (dt == vect_unknown_def_type)
360 {
361 if (dump_enabled_p ())
362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
363 "Unsupported pattern.\n");
364 return -1;
365 }
366
367 switch (gimple_code (def_stmt))
368 {
369 case GIMPLE_PHI:
370 def = gimple_phi_result (def_stmt);
371 break;
372
373 case GIMPLE_ASSIGN:
374 def = gimple_assign_lhs (def_stmt);
375 break;
376
377 default:
378 if (dump_enabled_p ())
379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
380 "unsupported defining stmt:\n");
381 return -1;
382 }
383 }
384
385 if (second)
386 oprnd_info->second_pattern = pattern;
387
388 if (first)
389 {
390 oprnd_info->first_dt = dt;
391 oprnd_info->first_pattern = pattern;
392 oprnd_info->first_op_type = TREE_TYPE (oprnd);
393 }
394 else
395 {
396 /* Not first stmt of the group, check that the def-stmt/s match
397 the def-stmt/s of the first stmt. Allow different definition
398 types for reduction chains: the first stmt must be a
399 vect_reduction_def (a phi node), and the rest
400 vect_internal_def. */
401 if (((oprnd_info->first_dt != dt
402 && !(oprnd_info->first_dt == vect_reduction_def
403 && dt == vect_internal_def)
404 && !((oprnd_info->first_dt == vect_external_def
405 || oprnd_info->first_dt == vect_constant_def)
406 && (dt == vect_external_def
407 || dt == vect_constant_def)))
408 || !types_compatible_p (oprnd_info->first_op_type,
409 TREE_TYPE (oprnd))))
410 {
411 /* Try swapping operands if we got a mismatch. */
412 if (i == 0
413 && !swapped
414 && commutative)
415 {
416 swapped = true;
417 goto again;
418 }
419
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "Build SLP failed: different types\n");
423
424 return 1;
425 }
426 }
427
428 /* Check the types of the definitions. */
429 switch (dt)
430 {
431 case vect_constant_def:
432 case vect_external_def:
433 case vect_reduction_def:
434 break;
435
436 case vect_internal_def:
437 oprnd_info->def_stmts.quick_push (def_stmt);
438 break;
439
440 default:
441 /* FORNOW: Not supported. */
442 if (dump_enabled_p ())
443 {
444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
445 "Build SLP failed: illegal type of def ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 }
449
450 return -1;
451 }
452 }
453
454 /* Swap operands. */
455 if (swapped)
456 {
457 if (first_op_cond)
458 {
459 tree cond = gimple_assign_rhs1 (stmt);
460 swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
461 &TREE_OPERAND (cond, 1));
462 TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
463 }
464 else
465 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
466 gimple_assign_rhs2_ptr (stmt));
467 }
468
469 return 0;
470 }
471
472
 473 /* Verify that the scalar stmts STMTS are isomorphic, do not require a data
 474 permutation, and are not of unsupported operation types. Return
 475 true if they are; otherwise return false and indicate in *MATCHES
 476 which stmts are not isomorphic to the first one. If MATCHES[0]
 477 is false then the comparison could not be
 478 carried out or the stmts will never be vectorized by SLP. */
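/* For illustration (hypothetical stmts, not from any testcase): the group

     a[0] = b[0] + c[0];
     a[1] = b[1] - c[1];
     a[2] = b[2] + c[2];
     a[3] = b[3] - c[3];

   is not strictly isomorphic, but the PLUS_EXPR/MINUS_EXPR mix is
   accepted via *TWO_OPERATORS: both operations are computed on whole
   vectors and the results are blended with a permutation, which is
   checked against the target at the end of this function.  */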
479
480 static bool
481 vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
482 vec<gimple> stmts, unsigned int group_size,
483 unsigned nops, unsigned int *max_nunits,
484 unsigned int vectorization_factor, bool *matches,
485 bool *two_operators)
486 {
487 unsigned int i;
488 gimple first_stmt = stmts[0], stmt = stmts[0];
489 enum tree_code first_stmt_code = ERROR_MARK;
490 enum tree_code alt_stmt_code = ERROR_MARK;
491 enum tree_code rhs_code = ERROR_MARK;
492 enum tree_code first_cond_code = ERROR_MARK;
493 tree lhs;
494 bool need_same_oprnds = false;
495 tree vectype, scalar_type, first_op1 = NULL_TREE;
496 optab optab;
497 int icode;
498 machine_mode optab_op2_mode;
499 machine_mode vec_mode;
500 struct data_reference *first_dr;
501 HOST_WIDE_INT dummy;
502 gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
503 tree cond;
504
 505 /* Check each stmt in STMTS for isomorphism against the first one. */
506 FOR_EACH_VEC_ELT (stmts, i, stmt)
507 {
508 matches[i] = false;
509
510 if (dump_enabled_p ())
511 {
512 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
513 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
514 dump_printf (MSG_NOTE, "\n");
515 }
516
 517 /* Refuse to vectorize statements marked as unvectorizable. */
518 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
519 {
520 if (dump_enabled_p ())
521 {
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "Build SLP failed: unvectorizable statement ");
524 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
525 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 }
527 /* Fatal mismatch. */
528 matches[0] = false;
529 return false;
530 }
531
532 lhs = gimple_get_lhs (stmt);
533 if (lhs == NULL_TREE)
534 {
535 if (dump_enabled_p ())
536 {
537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
538 "Build SLP failed: not GIMPLE_ASSIGN nor "
539 "GIMPLE_CALL ");
540 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
542 }
543 /* Fatal mismatch. */
544 matches[0] = false;
545 return false;
546 }
547
548 if (is_gimple_assign (stmt)
549 && gimple_assign_rhs_code (stmt) == COND_EXPR
550 && (cond = gimple_assign_rhs1 (stmt))
551 && !COMPARISON_CLASS_P (cond))
552 {
553 if (dump_enabled_p ())
554 {
555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
556 "Build SLP failed: condition is not "
557 "comparison ");
558 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
559 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
560 }
561 /* Fatal mismatch. */
562 matches[0] = false;
563 return false;
564 }
565
566 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
567 vectype = get_vectype_for_scalar_type (scalar_type);
568 if (!vectype)
569 {
570 if (dump_enabled_p ())
571 {
572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
573 "Build SLP failed: unsupported data-type ");
574 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
575 scalar_type);
576 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
577 }
578 /* Fatal mismatch. */
579 matches[0] = false;
580 return false;
581 }
582
583 /* If populating the vector type requires unrolling then fail
584 before adjusting *max_nunits for basic-block vectorization. */
585 if (bb_vinfo
586 && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
587 {
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Build SLP failed: unrolling required "
590 "in basic block SLP\n");
591 /* Fatal mismatch. */
592 matches[0] = false;
593 return false;
594 }
595
596 /* In case of multiple types we need to detect the smallest type. */
597 if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
598 {
599 *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
600 if (bb_vinfo)
601 vectorization_factor = *max_nunits;
602 }
603
604 if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
605 {
606 rhs_code = CALL_EXPR;
607 if (gimple_call_internal_p (call_stmt)
608 || gimple_call_tail_p (call_stmt)
609 || gimple_call_noreturn_p (call_stmt)
610 || !gimple_call_nothrow_p (call_stmt)
611 || gimple_call_chain (call_stmt))
612 {
613 if (dump_enabled_p ())
614 {
615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
616 "Build SLP failed: unsupported call type ");
617 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
618 call_stmt, 0);
619 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
620 }
621 /* Fatal mismatch. */
622 matches[0] = false;
623 return false;
624 }
625 }
626 else
627 rhs_code = gimple_assign_rhs_code (stmt);
628
629 /* Check the operation. */
630 if (i == 0)
631 {
632 first_stmt_code = rhs_code;
633
634 /* Shift arguments should be equal in all the packed stmts for a
635 vector shift with scalar shift operand. */
636 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
637 || rhs_code == LROTATE_EXPR
638 || rhs_code == RROTATE_EXPR)
639 {
640 vec_mode = TYPE_MODE (vectype);
641
642 /* First see if we have a vector/vector shift. */
643 optab = optab_for_tree_code (rhs_code, vectype,
644 optab_vector);
645
646 if (!optab
647 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
648 {
649 /* No vector/vector shift, try for a vector/scalar shift. */
650 optab = optab_for_tree_code (rhs_code, vectype,
651 optab_scalar);
652
653 if (!optab)
654 {
655 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
657 "Build SLP failed: no optab.\n");
658 /* Fatal mismatch. */
659 matches[0] = false;
660 return false;
661 }
662 icode = (int) optab_handler (optab, vec_mode);
663 if (icode == CODE_FOR_nothing)
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "Build SLP failed: "
668 "op not supported by target.\n");
669 /* Fatal mismatch. */
670 matches[0] = false;
671 return false;
672 }
673 optab_op2_mode = insn_data[icode].operand[2].mode;
674 if (!VECTOR_MODE_P (optab_op2_mode))
675 {
676 need_same_oprnds = true;
677 first_op1 = gimple_assign_rhs2 (stmt);
678 }
679 }
680 }
681 else if (rhs_code == WIDEN_LSHIFT_EXPR)
682 {
683 need_same_oprnds = true;
684 first_op1 = gimple_assign_rhs2 (stmt);
685 }
686 }
687 else
688 {
689 if (first_stmt_code != rhs_code
690 && alt_stmt_code == ERROR_MARK)
691 alt_stmt_code = rhs_code;
692 if (first_stmt_code != rhs_code
693 && (first_stmt_code != IMAGPART_EXPR
694 || rhs_code != REALPART_EXPR)
695 && (first_stmt_code != REALPART_EXPR
696 || rhs_code != IMAGPART_EXPR)
697 /* Handle mismatches in plus/minus by computing both
698 and merging the results. */
699 && !((first_stmt_code == PLUS_EXPR
700 || first_stmt_code == MINUS_EXPR)
701 && (alt_stmt_code == PLUS_EXPR
702 || alt_stmt_code == MINUS_EXPR)
703 && rhs_code == alt_stmt_code)
704 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
705 && (first_stmt_code == ARRAY_REF
706 || first_stmt_code == BIT_FIELD_REF
707 || first_stmt_code == INDIRECT_REF
708 || first_stmt_code == COMPONENT_REF
709 || first_stmt_code == MEM_REF)))
710 {
711 if (dump_enabled_p ())
712 {
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Build SLP failed: different operation "
715 "in stmt ");
716 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718 "original stmt ");
719 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
720 first_stmt, 0);
721 }
722 /* Mismatch. */
723 continue;
724 }
725
726 if (need_same_oprnds
727 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
728 {
729 if (dump_enabled_p ())
730 {
731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
732 "Build SLP failed: different shift "
733 "arguments in ");
734 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
735 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
736 }
737 /* Mismatch. */
738 continue;
739 }
740
741 if (rhs_code == CALL_EXPR)
742 {
743 gimple first_stmt = stmts[0];
744 if (gimple_call_num_args (stmt) != nops
745 || !operand_equal_p (gimple_call_fn (first_stmt),
746 gimple_call_fn (stmt), 0)
747 || gimple_call_fntype (first_stmt)
748 != gimple_call_fntype (stmt))
749 {
750 if (dump_enabled_p ())
751 {
752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
753 "Build SLP failed: different calls in ");
754 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
755 stmt, 0);
756 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
757 }
758 /* Mismatch. */
759 continue;
760 }
761 }
762 }
763
764 /* Grouped store or load. */
765 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
766 {
767 if (REFERENCE_CLASS_P (lhs))
768 {
769 /* Store. */
770 ;
771 }
772 else
773 {
774 /* Load. */
775 unsigned unrolling_factor
776 = least_common_multiple
777 (*max_nunits, group_size) / group_size;
778 /* FORNOW: Check that there is no gap between the loads
779 and no gap between the groups when we need to load
780 multiple groups at once.
781 ??? We should enhance this to only disallow gaps
782 inside vectors. */
783 if ((unrolling_factor > 1
784 && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
785 && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
786 /* If the group is split up then GROUP_GAP
787 isn't correct here, nor is GROUP_FIRST_ELEMENT. */
788 || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
789 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
790 && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
791 {
792 if (dump_enabled_p ())
793 {
794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
795 "Build SLP failed: grouped "
796 "loads have gaps ");
797 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
798 stmt, 0);
799 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
800 }
801 /* Fatal mismatch. */
802 matches[0] = false;
803 return false;
804 }
805
 806 /* Check that the size of the interleaved loads group is not
 807 greater than the SLP group size. */
808 unsigned ncopies
809 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
810 if (loop_vinfo
811 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
812 && ((GROUP_SIZE (vinfo_for_stmt (stmt))
813 - GROUP_GAP (vinfo_for_stmt (stmt)))
814 > ncopies * group_size))
815 {
816 if (dump_enabled_p ())
817 {
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: the number "
820 "of interleaved loads is greater than "
821 "the SLP group size ");
822 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
823 stmt, 0);
824 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
825 }
826 /* Fatal mismatch. */
827 matches[0] = false;
828 return false;
829 }
830
831 old_first_load = first_load;
832 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
833 if (prev_first_load)
834 {
835 /* Check that there are no loads from different interleaving
836 chains in the same node. */
837 if (prev_first_load != first_load)
838 {
839 if (dump_enabled_p ())
840 {
841 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
842 vect_location,
843 "Build SLP failed: different "
844 "interleaving chains in one node ");
845 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
846 stmt, 0);
847 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
848 }
849 /* Mismatch. */
850 continue;
851 }
852 }
853 else
854 prev_first_load = first_load;
855
856 /* In some cases a group of loads is just the same load
857 repeated N times. Only analyze its cost once. */
858 if (first_load == stmt && old_first_load != first_load)
859 {
860 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
861 if (vect_supportable_dr_alignment (first_dr, false)
862 == dr_unaligned_unsupported)
863 {
864 if (dump_enabled_p ())
865 {
866 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
867 vect_location,
868 "Build SLP failed: unsupported "
869 "unaligned load ");
870 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
871 stmt, 0);
872 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
873 }
874 /* Fatal mismatch. */
875 matches[0] = false;
876 return false;
877 }
878 }
879 }
880 } /* Grouped access. */
881 else
882 {
883 if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
884 {
 885 /* Not a grouped load. */
886 if (dump_enabled_p ())
887 {
888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
889 "Build SLP failed: not grouped load ");
890 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
891 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
892 }
893
 894 /* FORNOW: Non-grouped loads are not supported. */
895 /* Fatal mismatch. */
896 matches[0] = false;
897 return false;
898 }
899
 900 /* Not a memory operation. */
901 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
902 && TREE_CODE_CLASS (rhs_code) != tcc_unary
903 && TREE_CODE_CLASS (rhs_code) != tcc_expression
904 && rhs_code != CALL_EXPR)
905 {
906 if (dump_enabled_p ())
907 {
908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 "Build SLP failed: operation");
910 dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
911 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
912 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
913 }
914 /* Fatal mismatch. */
915 matches[0] = false;
916 return false;
917 }
918
919 if (rhs_code == COND_EXPR)
920 {
921 tree cond_expr = gimple_assign_rhs1 (stmt);
922
923 if (i == 0)
924 first_cond_code = TREE_CODE (cond_expr);
925 else if (first_cond_code != TREE_CODE (cond_expr))
926 {
927 if (dump_enabled_p ())
928 {
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "Build SLP failed: different"
931 " operation");
932 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
933 stmt, 0);
934 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
935 }
936 /* Mismatch. */
937 continue;
938 }
939 }
940 }
941
942 matches[i] = true;
943 }
944
945 for (i = 0; i < group_size; ++i)
946 if (!matches[i])
947 return false;
948
949 /* If we allowed a two-operation SLP node verify the target can cope
950 with the permute we are going to use. */
951 if (alt_stmt_code != ERROR_MARK
952 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
953 {
954 unsigned char *sel
955 = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
956 for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
957 {
958 sel[i] = i;
959 if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
960 sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
961 }
962 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
963 {
964 for (i = 0; i < group_size; ++i)
965 if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
966 {
967 matches[i] = false;
968 if (dump_enabled_p ())
969 {
970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
971 "Build SLP failed: different operation "
972 "in stmt ");
973 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
974 stmts[i], 0);
975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
976 "original stmt ");
977 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
978 first_stmt, 0);
979 }
980 }
981 return false;
982 }
983 *two_operators = true;
984 }
985
986 return true;
987 }
988
 989 /* Recursively build an SLP tree starting from NODE.
 990 Fail (and return false) if def-stmts are not
 991 isomorphic, require data permutation or are of unsupported types of
 992 operation. Otherwise return true. On failure,
 993 MATCHES indicates which scalar stmts (if any)
 994 did not match the first one. */
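/* A sketch of the tree being built (hypothetical stmts, not from any
   testcase): for the grouped stores a[0] = b[0] + c[0] and
   a[1] = b[1] + c[1], the root node holds the two stores, its single
   child holds the two PLUS_EXPR defs, and that child's children hold
   the grouped loads from b[] and c[], where the recursion stops.  */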
995
996 static bool
997 vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
998 slp_tree *node, unsigned int group_size,
999 unsigned int *max_nunits,
1000 vec<slp_tree> *loads,
1001 unsigned int vectorization_factor,
1002 bool *matches, unsigned *npermutes, unsigned *tree_size,
1003 unsigned max_tree_size)
1004 {
1005 unsigned nops, i, this_tree_size = 0;
1006 gimple stmt;
1007
1008 matches[0] = false;
1009
1010 stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
1011 if (is_gimple_call (stmt))
1012 nops = gimple_call_num_args (stmt);
1013 else if (is_gimple_assign (stmt))
1014 {
1015 nops = gimple_num_ops (stmt) - 1;
1016 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1017 nops++;
1018 }
1019 else
1020 return false;
1021
1022 bool two_operators = false;
1023 if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
1024 SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
1025 max_nunits, vectorization_factor, matches,
1026 &two_operators))
1027 return false;
1028 SLP_TREE_TWO_OPERATORS (*node) = two_operators;
1029
1030 /* If the SLP node is a load, terminate the recursion. */
1031 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1032 && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1033 {
1034 loads->safe_push (*node);
1035 return true;
1036 }
1037
1038 /* Get at the operands, verifying they are compatible. */
1039 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1040 slp_oprnd_info oprnd_info;
1041 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
1042 {
1043 switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
1044 stmt, i, &oprnds_info))
1045 {
1046 case 0:
1047 break;
1048 case -1:
1049 matches[0] = false;
1050 vect_free_oprnd_info (oprnds_info);
1051 return false;
1052 case 1:
1053 matches[i] = false;
1054 break;
1055 }
1056 }
1057 for (i = 0; i < group_size; ++i)
1058 if (!matches[i])
1059 {
1060 vect_free_oprnd_info (oprnds_info);
1061 return false;
1062 }
1063
1064 stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
1065
1066 /* Create SLP_TREE nodes for the definition node/s. */
1067 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1068 {
1069 slp_tree child;
1070 unsigned old_nloads = loads->length ();
1071 unsigned old_max_nunits = *max_nunits;
1072
1073 if (oprnd_info->first_dt != vect_internal_def)
1074 continue;
1075
1076 if (++this_tree_size > max_tree_size)
1077 {
1078 vect_free_oprnd_info (oprnds_info);
1079 return false;
1080 }
1081
1082 child = vect_create_new_slp_node (oprnd_info->def_stmts);
1083 if (!child)
1084 {
1085 vect_free_oprnd_info (oprnds_info);
1086 return false;
1087 }
1088
1089 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1090 group_size, max_nunits, loads,
1091 vectorization_factor, matches,
1092 npermutes, &this_tree_size, max_tree_size))
1093 {
 1094 /* If all children of this child were built up from scalars then just
 1095 throw that away and build this node up from scalars instead. */
1096 if (!SLP_TREE_CHILDREN (child).is_empty ())
1097 {
1098 unsigned int j;
1099 slp_tree grandchild;
1100
1101 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1102 if (grandchild != NULL)
1103 break;
1104 if (!grandchild)
1105 {
1106 /* Roll back. */
1107 *max_nunits = old_max_nunits;
1108 loads->truncate (old_nloads);
1109 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1110 vect_free_slp_tree (grandchild);
1111 SLP_TREE_CHILDREN (child).truncate (0);
1112
1113 dump_printf_loc (MSG_NOTE, vect_location,
1114 "Building parent vector operands from "
1115 "scalars instead\n");
1116 oprnd_info->def_stmts = vNULL;
1117 vect_free_slp_tree (child);
1118 SLP_TREE_CHILDREN (*node).quick_push (NULL);
1119 continue;
1120 }
1121 }
1122
1123 oprnd_info->def_stmts = vNULL;
1124 SLP_TREE_CHILDREN (*node).quick_push (child);
1125 continue;
1126 }
1127
1128 /* If the SLP build failed fatally and we analyze a basic-block
1129 simply treat nodes we fail to build as externally defined
1130 (and thus build vectors from the scalar defs).
1131 The cost model will reject outright expensive cases.
 1132 ??? This doesn't treat cases where permutation ultimately
1133 fails (or we don't try permutation below). Ideally we'd
1134 even compute a permutation that will end up with the maximum
1135 SLP tree size... */
1136 if (bb_vinfo
1137 && !matches[0]
1138 /* ??? Rejecting patterns this way doesn't work. We'd have to
1139 do extra work to cancel the pattern so the uses see the
1140 scalar version. */
1141 && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1142 {
1143 unsigned int j;
1144 slp_tree grandchild;
1145
1146 /* Roll back. */
1147 *max_nunits = old_max_nunits;
1148 loads->truncate (old_nloads);
1149 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1150 vect_free_slp_tree (grandchild);
1151 SLP_TREE_CHILDREN (child).truncate (0);
1152
1153 dump_printf_loc (MSG_NOTE, vect_location,
1154 "Building vector operands from scalars\n");
1155 oprnd_info->def_stmts = vNULL;
1156 vect_free_slp_tree (child);
1157 SLP_TREE_CHILDREN (*node).quick_push (NULL);
1158 continue;
1159 }
1160
 1161 /* If the SLP build for operand zero failed and operands zero
 1162 and one can be commuted, try that for the scalar stmts
1163 that failed the match. */
1164 if (i == 0
1165 /* A first scalar stmt mismatch signals a fatal mismatch. */
1166 && matches[0]
1167 /* ??? For COND_EXPRs we can swap the comparison operands
1168 as well as the arms under some constraints. */
1169 && nops == 2
1170 && oprnds_info[1]->first_dt == vect_internal_def
1171 && is_gimple_assign (stmt)
1172 && commutative_tree_code (gimple_assign_rhs_code (stmt))
1173 && !SLP_TREE_TWO_OPERATORS (*node)
 1174 /* Do so only if the number of unsuccessful permutes was not more
 1175 than a cut-off, as re-trying the recursive match on
1176 possibly each level of the tree would expose exponential
1177 behavior. */
1178 && *npermutes < 4)
1179 {
1180 unsigned int j;
1181 slp_tree grandchild;
1182
1183 /* Roll back. */
1184 *max_nunits = old_max_nunits;
1185 loads->truncate (old_nloads);
1186 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1187 vect_free_slp_tree (grandchild);
1188 SLP_TREE_CHILDREN (child).truncate (0);
1189
1190 /* Swap mismatched definition stmts. */
1191 dump_printf_loc (MSG_NOTE, vect_location,
1192 "Re-trying with swapped operands of stmts ");
1193 for (j = 0; j < group_size; ++j)
1194 if (!matches[j])
1195 {
1196 gimple tem = oprnds_info[0]->def_stmts[j];
1197 oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
1198 oprnds_info[1]->def_stmts[j] = tem;
1199 dump_printf (MSG_NOTE, "%d ", j);
1200 }
1201 dump_printf (MSG_NOTE, "\n");
1202 /* And try again with scratch 'matches' ... */
1203 bool *tem = XALLOCAVEC (bool, group_size);
1204 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1205 group_size, max_nunits, loads,
1206 vectorization_factor,
1207 tem, npermutes, &this_tree_size,
1208 max_tree_size))
1209 {
1210 /* ... so if successful we can apply the operand swapping
1211 to the GIMPLE IL. This is necessary because for example
1212 vect_get_slp_defs uses operand indexes and thus expects
1213 canonical operand order. */
1214 for (j = 0; j < group_size; ++j)
1215 if (!matches[j])
1216 {
1217 gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
1218 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1219 gimple_assign_rhs2_ptr (stmt));
1220 }
1221 oprnd_info->def_stmts = vNULL;
1222 SLP_TREE_CHILDREN (*node).quick_push (child);
1223 continue;
1224 }
1225
1226 ++*npermutes;
1227 }
1228
1229 oprnd_info->def_stmts = vNULL;
1230 vect_free_slp_tree (child);
1231 vect_free_oprnd_info (oprnds_info);
1232 return false;
1233 }
1234
1235 if (tree_size)
1236 *tree_size += this_tree_size;
1237
1238 vect_free_oprnd_info (oprnds_info);
1239 return true;
1240 }
1241
1242 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1243
1244 static void
1245 vect_print_slp_tree (int dump_kind, slp_tree node)
1246 {
1247 int i;
1248 gimple stmt;
1249 slp_tree child;
1250
1251 if (!node)
1252 return;
1253
1254 dump_printf (dump_kind, "node ");
1255 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1256 {
1257 dump_printf (dump_kind, "\n\tstmt %d ", i);
1258 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1259 }
1260 dump_printf (dump_kind, "\n");
1261
1262 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1263 vect_print_slp_tree (dump_kind, child);
1264 }
1265
1266
1267 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1268 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1269 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1270 stmts in NODE are to be marked. */
1271
1272 static void
1273 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1274 {
1275 int i;
1276 gimple stmt;
1277 slp_tree child;
1278
1279 if (!node)
1280 return;
1281
1282 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1283 if (j < 0 || i == j)
1284 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1285
1286 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1287 vect_mark_slp_stmts (child, mark, j);
1288 }
1289
1290
1291 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1292
1293 static void
1294 vect_mark_slp_stmts_relevant (slp_tree node)
1295 {
1296 int i;
1297 gimple stmt;
1298 stmt_vec_info stmt_info;
1299 slp_tree child;
1300
1301 if (!node)
1302 return;
1303
1304 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1305 {
1306 stmt_info = vinfo_for_stmt (stmt);
1307 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1308 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1309 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1310 }
1311
1312 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1313 vect_mark_slp_stmts_relevant (child);
1314 }
1315
1316
1317 /* Rearrange the statements of NODE according to PERMUTATION. */
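/* For illustration (hypothetical): with scalar stmts {S0, S1, S2, S3}
   and PERMUTATION {2, 0, 1, 3}, each stmt moves to the index given by
   its permutation entry, yielding {S1, S2, S0, S3}.  */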
1318
1319 static void
1320 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1321 vec<unsigned> permutation)
1322 {
1323 gimple stmt;
1324 vec<gimple> tmp_stmts;
1325 unsigned int i;
1326 slp_tree child;
1327
1328 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1329 vect_slp_rearrange_stmts (child, group_size, permutation);
1330
1331 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1332 tmp_stmts.create (group_size);
1333 tmp_stmts.quick_grow_cleared (group_size);
1334
1335 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1336 tmp_stmts[permutation[i]] = stmt;
1337
1338 SLP_TREE_SCALAR_STMTS (node).release ();
1339 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1340 }
1341
1342
1343 /* Check if the required load permutations in the SLP instance
1344 SLP_INSTN are supported. */
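/* A hypothetical example: a load node reading b[1], b[0], b[3], b[2]
   from the interleaving chain that starts at b[0] carries the load
   permutation {1, 0, 3, 2}; whether that is supported depends on the
   checks below (reduction, subchain in basic-block SLP, or a
   target-supported permutation).  */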
1345
1346 static bool
1347 vect_supported_load_permutation_p (slp_instance slp_instn)
1348 {
1349 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1350 unsigned int i, j, k, next;
1351 sbitmap load_index;
1352 slp_tree node;
1353 gimple stmt, load, next_load, first_load;
1354 struct data_reference *dr;
1355
1356 if (dump_enabled_p ())
1357 {
1358 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1359 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1360 if (node->load_permutation.exists ())
1361 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1362 dump_printf (MSG_NOTE, "%d ", next);
1363 else
1364 for (k = 0; k < group_size; ++k)
1365 dump_printf (MSG_NOTE, "%d ", k);
1366 dump_printf (MSG_NOTE, "\n");
1367 }
1368
1369 /* In case of reduction every load permutation is allowed, since the order
1370 of the reduction statements is not important (as opposed to the case of
1371 grouped stores). The only condition we need to check is that all the
1372 load nodes are of the same size and have the same permutation (and then
1373 rearrange all the nodes of the SLP instance according to this
1374 permutation). */
1375
1376 /* Check that all the load nodes are of the same size. */
1377 /* ??? Can't we assert this? */
1378 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1379 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1380 return false;
1381
1382 node = SLP_INSTANCE_TREE (slp_instn);
1383 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1384
1385 /* Reduction (there are no data-refs in the root).
 1386 In a reduction chain the order of the loads is important. */
1387 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1388 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1389 {
1390 slp_tree load;
1391 unsigned int lidx;
1392
1393 /* Compare all the permutation sequences to the first one. We know
1394 that at least one load is permuted. */
1395 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1396 if (!node->load_permutation.exists ())
1397 return false;
1398 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1399 {
1400 if (!load->load_permutation.exists ())
1401 return false;
1402 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1403 if (lidx != node->load_permutation[j])
1404 return false;
1405 }
1406
1407 /* Check that the loads in the first sequence are different and there
1408 are no gaps between them. */
1409 load_index = sbitmap_alloc (group_size);
1410 bitmap_clear (load_index);
1411 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1412 {
1413 if (bitmap_bit_p (load_index, lidx))
1414 {
1415 sbitmap_free (load_index);
1416 return false;
1417 }
1418 bitmap_set_bit (load_index, lidx);
1419 }
1420 for (i = 0; i < group_size; i++)
1421 if (!bitmap_bit_p (load_index, i))
1422 {
1423 sbitmap_free (load_index);
1424 return false;
1425 }
1426 sbitmap_free (load_index);
1427
1428 /* This permutation is valid for reduction. Since the order of the
1429 statements in the nodes is not important unless they are memory
1430 accesses, we can rearrange the statements in all the nodes
1431 according to the order of the loads. */
1432 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1433 node->load_permutation);
1434
1435 /* We are done, no actual permutations need to be generated. */
1436 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1437 SLP_TREE_LOAD_PERMUTATION (node).release ();
1438 return true;
1439 }
1440
1441 /* In basic block vectorization we allow any subchain of an interleaving
1442 chain.
 1443 FORNOW: not supported in loop SLP because of realignment complications. */
1444 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1445 {
1446 /* Check whether the loads in an instance form a subchain and thus
1447 no permutation is necessary. */
1448 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1449 {
1450 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1451 continue;
1452 bool subchain_p = true;
1453 next_load = NULL;
1454 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1455 {
1456 if (j != 0 && next_load != load)
1457 {
1458 subchain_p = false;
1459 break;
1460 }
1461 next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1462 }
1463 if (subchain_p)
1464 SLP_TREE_LOAD_PERMUTATION (node).release ();
1465 else
1466 {
1467 /* Verify the permutation can be generated. */
1468 vec<tree> tem;
1469 if (!vect_transform_slp_perm_load (node, tem, NULL,
1470 1, slp_instn, true))
1471 {
1472 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1473 vect_location,
1474 "unsupported load permutation\n");
1475 return false;
1476 }
1477 }
1478 }
1479
1480 /* Check that the alignment of the first load in every subchain, i.e.,
1481 the first statement in every load node, is supported.
1482 ??? This belongs in alignment checking. */
1483 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1484 {
1485 first_load = SLP_TREE_SCALAR_STMTS (node)[0];
1486 if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
1487 {
1488 dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
1489 if (vect_supportable_dr_alignment (dr, false)
1490 == dr_unaligned_unsupported)
1491 {
1492 if (dump_enabled_p ())
1493 {
1494 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1495 vect_location,
1496 "unsupported unaligned load ");
1497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1498 first_load, 0);
1499 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1500 }
1501 return false;
1502 }
1503 }
1504 }
1505
1506 return true;
1507 }
1508
 1509 /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
 1510 GROUP_SIZE, where each sequence of accesses to the same dr is also of
 1511 GROUP_SIZE length (unless it's a reduction). */
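/* A hypothetical instance matching this pattern with GROUP_SIZE == 2:
   two load nodes with permutation sequences {0, 0} and {1, 1}, i.e.
   each node reads one element of the chain GROUP_SIZE times.  */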
1512 if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
1513 return false;
1514 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1515 if (!node->load_permutation.exists ())
1516 return false;
1517
1518 load_index = sbitmap_alloc (group_size);
1519 bitmap_clear (load_index);
1520 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1521 {
1522 unsigned int lidx = node->load_permutation[0];
1523 if (bitmap_bit_p (load_index, lidx))
1524 {
1525 sbitmap_free (load_index);
1526 return false;
1527 }
1528 bitmap_set_bit (load_index, lidx);
1529 FOR_EACH_VEC_ELT (node->load_permutation, j, k)
1530 if (k != lidx)
1531 {
1532 sbitmap_free (load_index);
1533 return false;
1534 }
1535 }
1536 for (i = 0; i < group_size; i++)
1537 if (!bitmap_bit_p (load_index, i))
1538 {
1539 sbitmap_free (load_index);
1540 return false;
1541 }
1542 sbitmap_free (load_index);
1543
1544 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1545 if (node->load_permutation.exists ()
1546 && !vect_transform_slp_perm_load
1547 (node, vNULL, NULL,
1548 SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
1549 return false;
1550 return true;
1551 }
1552
1553
 1554 /* Find the last scalar stmt in the SLP tree rooted at NODE. */
1555
1556 static gimple
1557 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1558 {
1559 gimple last = NULL, stmt;
1560
1561 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1562 {
1563 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1564 if (is_pattern_stmt_p (stmt_vinfo))
1565 last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1566 else
1567 last = get_later_stmt (stmt, last);
1568 }
1569
1570 return last;
1571 }
1572
1573 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1574
1575 static void
1576 vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
1577 stmt_vector_for_cost *prologue_cost_vec,
1578 stmt_vector_for_cost *body_cost_vec,
1579 unsigned ncopies_for_cost)
1580 {
1581 unsigned i;
1582 slp_tree child;
1583 gimple stmt, s;
1584 stmt_vec_info stmt_info;
1585 tree lhs;
1586 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1587
1588 /* Recurse down the SLP tree. */
1589 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1590 if (child)
1591 vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
1592 body_cost_vec, ncopies_for_cost);
1593
1594 /* Look at the first scalar stmt to determine the cost. */
1595 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1596 stmt_info = vinfo_for_stmt (stmt);
1597 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1598 {
1599 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1600 vect_model_store_cost (stmt_info, ncopies_for_cost, false,
1601 vect_uninitialized_def,
1602 node, prologue_cost_vec, body_cost_vec);
1603 else
1604 {
1605 int i;
1606 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1607 vect_model_load_cost (stmt_info, ncopies_for_cost, false,
1608 node, prologue_cost_vec, body_cost_vec);
1609 /* If the load is permuted record the cost for the permutation.
1610 ??? Loads from multiple chains are let through here only
1611 for a single special case involving complex numbers where
1612 in the end no permutation is necessary. */
1613 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
1614 if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
1615 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
1616 && vect_get_place_in_interleaving_chain
1617 (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
1618 {
1619 record_stmt_cost (body_cost_vec, group_size, vec_perm,
1620 stmt_info, 0, vect_body);
1621 break;
1622 }
1623 }
1624 }
1625 else
1626 {
1627 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1628 stmt_info, 0, vect_body);
1629 if (SLP_TREE_TWO_OPERATORS (node))
1630 {
1631 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1632 stmt_info, 0, vect_body);
1633 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1634 stmt_info, 0, vect_body);
1635 }
1636 }
1637
1638 /* Scan operands and account for prologue cost of constants/externals.
1639 ??? This over-estimates cost for multiple uses and should be
1640 re-engineered. */
1641 lhs = gimple_get_lhs (stmt);
1642 for (i = 0; i < gimple_num_ops (stmt); ++i)
1643 {
1644 tree def, op = gimple_op (stmt, i);
1645 gimple def_stmt;
1646 enum vect_def_type dt;
1647 if (!op || op == lhs)
1648 continue;
1649 if (vect_is_simple_use (op, NULL, STMT_VINFO_LOOP_VINFO (stmt_info),
1650 STMT_VINFO_BB_VINFO (stmt_info),
1651 &def_stmt, &def, &dt))
1652 {
1653 /* Without looking at the actual initializer a vector of
1654 constants can be implemented as load from the constant pool.
1655 ??? We need to pass down stmt_info for a vector type
1656 even if it points to the wrong stmt. */
1657 if (dt == vect_constant_def)
1658 record_stmt_cost (prologue_cost_vec, 1, vector_load,
1659 stmt_info, 0, vect_prologue);
1660 else if (dt == vect_external_def)
1661 record_stmt_cost (prologue_cost_vec, 1, vec_construct,
1662 stmt_info, 0, vect_prologue);
1663 }
1664 }
1665 }
1666
1667 /* Compute the cost for the SLP instance INSTANCE. */
1668
1669 static void
1670 vect_analyze_slp_cost (slp_instance instance, void *data)
1671 {
1672 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1673 unsigned ncopies_for_cost;
1674 stmt_info_for_cost *si;
1675 unsigned i;
1676
1677 /* Calculate the number of vector stmts to create based on the unrolling
1678 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
 1679 GROUP_SIZE / NUNITS otherwise). */
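/* E.g. (hypothetical numbers): with NUNITS == 4 and an adjusted
   GROUP_SIZE == 6, least_common_multiple (4, 6) / 4 == 3 vector
   stmts are accounted for each SLP node.  */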
1680 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1681 slp_tree node = SLP_INSTANCE_TREE (instance);
1682 stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
1683 /* Adjust the group_size by the vectorization factor which is always one
1684 for basic-block vectorization. */
1685 if (STMT_VINFO_LOOP_VINFO (stmt_info))
1686 group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
1687 unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1688 /* For reductions look at a reduction operand in case the reduction
1689 operation is widening like DOT_PROD or SAD. */
1690 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1691 {
1692 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1693 switch (gimple_assign_rhs_code (stmt))
1694 {
1695 case DOT_PROD_EXPR:
1696 case SAD_EXPR:
1697 nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
1698 (TREE_TYPE (gimple_assign_rhs1 (stmt))));
1699 break;
1700 default:;
1701 }
1702 }
1703 ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
1704
1705 prologue_cost_vec.create (10);
1706 body_cost_vec.create (10);
1707 vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
1708 &prologue_cost_vec, &body_cost_vec,
1709 ncopies_for_cost);
1710
1711 /* Record the prologue costs, which were delayed until we were
1712 sure that SLP was successful. */
1713 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1714 {
1715 struct _stmt_vec_info *stmt_info
1716 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1717 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1718 si->misalign, vect_prologue);
1719 }
1720
1721 /* Record the instance's instructions in the target cost model. */
1722 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1723 {
1724 struct _stmt_vec_info *stmt_info
1725 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1726 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1727 si->misalign, vect_body);
1728 }
1729
1730 prologue_cost_vec.release ();
1731 body_cost_vec.release ();
1732 }
1733
1734 /* Analyze an SLP instance starting from a group of grouped stores. Call
1735 vect_build_slp_tree to build a tree of packed stmts if possible.
 1736 Return FALSE if it's impossible to SLP any stmt in the group. */
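/* E.g. (hypothetical numbers): a group of 2 stores with a vector type
   of 4 units gives unrolling_factor == least_common_multiple (4, 2) / 2
   == 2, i.e. two copies of the group fill one vector; such unrolling
   is rejected below for basic-block SLP.  */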
1737
1738 static bool
1739 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1740 gimple stmt, unsigned max_tree_size)
1741 {
1742 slp_instance new_instance;
1743 slp_tree node;
1744 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1745 unsigned int unrolling_factor = 1, nunits;
1746 tree vectype, scalar_type = NULL_TREE;
1747 gimple next;
1748 unsigned int vectorization_factor = 0;
1749 int i;
1750 unsigned int max_nunits = 0;
1751 vec<slp_tree> loads;
1752 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1753 vec<gimple> scalar_stmts;
1754
1755 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1756 {
1757 if (dr)
1758 {
1759 scalar_type = TREE_TYPE (DR_REF (dr));
1760 vectype = get_vectype_for_scalar_type (scalar_type);
1761 }
1762 else
1763 {
1764 gcc_assert (loop_vinfo);
1765 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1766 }
1767
1768 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1769 }
1770 else
1771 {
1772 gcc_assert (loop_vinfo);
1773 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1774 group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1775 }
1776
1777 if (!vectype)
1778 {
1779 if (dump_enabled_p ())
1780 {
1781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782 "Build SLP failed: unsupported data-type ");
1783 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1784 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1785 }
1786
1787 return false;
1788 }
1789
1790 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1791 if (loop_vinfo)
1792 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1793 else
1794 vectorization_factor = nunits;
1795
1796 /* Calculate the unrolling factor. */
1797 unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1798 if (unrolling_factor != 1 && !loop_vinfo)
1799 {
1800 if (dump_enabled_p ())
1801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1802 "Build SLP failed: unrolling required in basic"
1803 " block SLP\n");
1804
1805 return false;
1806 }
1807
1808 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1809 scalar_stmts.create (group_size);
1810 next = stmt;
1811 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1812 {
1813 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1814 while (next)
1815 {
1816 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1817 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1818 scalar_stmts.safe_push (
1819 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1820 else
1821 scalar_stmts.safe_push (next);
1822 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1823 }
1824 /* Mark the first element of the reduction chain as reduction to properly
1825 transform the node. In the reduction analysis phase only the last
1826 element of the chain is marked as reduction. */
1827 if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
1828 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
1829 }
1830 else
1831 {
1832 /* Collect reduction statements. */
1833 vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1834 for (i = 0; reductions.iterate (i, &next); i++)
1835 scalar_stmts.safe_push (next);
1836 }
1837
1838 node = vect_create_new_slp_node (scalar_stmts);
1839
1840 loads.create (group_size);
1841
1842 /* Build the tree for the SLP instance. */
1843 bool *matches = XALLOCAVEC (bool, group_size);
1844 unsigned npermutes = 0;
1845 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1846 &max_nunits, &loads,
1847 vectorization_factor, matches, &npermutes, NULL,
1848 max_tree_size))
1849 {
1850 /* Calculate the unrolling factor based on the smallest type. */
1851 if (max_nunits > nunits)
1852 unrolling_factor = least_common_multiple (max_nunits, group_size)
1853 / group_size;
1854
1855 if (unrolling_factor != 1 && !loop_vinfo)
1856 {
1857 if (dump_enabled_p ())
1858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1859 "Build SLP failed: unrolling required in basic"
1860 " block SLP\n");
1861 vect_free_slp_tree (node);
1862 loads.release ();
1863 return false;
1864 }
1865
1866 /* Create a new SLP instance. */
1867 new_instance = XNEW (struct _slp_instance);
1868 SLP_INSTANCE_TREE (new_instance) = node;
1869 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1870 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1871 SLP_INSTANCE_LOADS (new_instance) = loads;
1872
1873 /* Compute the load permutation. */
1874 slp_tree load_node;
1875 bool loads_permuted = false;
1876 FOR_EACH_VEC_ELT (loads, i, load_node)
1877 {
1878 vec<unsigned> load_permutation;
1879 int j;
1880 gimple load, first_stmt;
1881 bool this_load_permuted = false;
1882 load_permutation.create (group_size);
1883 first_stmt = GROUP_FIRST_ELEMENT
1884 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1885 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1886 {
1887 int load_place
1888 = vect_get_place_in_interleaving_chain (load, first_stmt);
1889 gcc_assert (load_place != -1);
1890 if (load_place != j)
1891 this_load_permuted = true;
1892 load_permutation.safe_push (load_place);
1893 }
1894 if (!this_load_permuted)
1895 {
1896 load_permutation.release ();
1897 continue;
1898 }
1899 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1900 loads_permuted = true;
1901 }
1902
1903 if (loads_permuted)
1904 {
1905 if (!vect_supported_load_permutation_p (new_instance))
1906 {
1907 if (dump_enabled_p ())
1908 {
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "Build SLP failed: unsupported load "
1911 "permutation ");
1912 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1913 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1914 }
1915 vect_free_slp_instance (new_instance);
1916 return false;
1917 }
1918 }
1919
1920
1921 if (loop_vinfo)
1922 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1923 else
1924 BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1925
1926 if (dump_enabled_p ())
1927 vect_print_slp_tree (MSG_NOTE, node);
1928
1929 return true;
1930 }
1931
1932 /* Failed to SLP. */
1933 /* Free the allocated memory. */
1934 vect_free_slp_tree (node);
1935 loads.release ();
1936
1937 return false;
1938 }
1939
1940
1941 /* Check if there are stmts in the loop that can be vectorized using SLP.
1942 Build SLP trees of packed scalar stmts if SLP is possible. */
1943
1944 bool
1945 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1946 unsigned max_tree_size)
1947 {
1948 unsigned int i;
1949 vec<gimple> grouped_stores;
1950 vec<gimple> reductions = vNULL;
1951 vec<gimple> reduc_chains = vNULL;
1952 gimple first_element;
1953 bool ok = false;
1954
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1957
1958 if (loop_vinfo)
1959 {
1960 grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1961 reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1962 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1963 }
1964 else
1965 grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1966
1967 /* Find SLP sequences starting from groups of grouped stores. */
1968 FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1969 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1970 max_tree_size))
1971 ok = true;
1972
1973 if (reduc_chains.length () > 0)
1974 {
1975 /* Find SLP sequences starting from reduction chains. */
1976 FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1977 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1978 max_tree_size))
1979 ok = true;
1980 else
1981 return false;
1982
1983 /* Don't try to vectorize SLP reductions if reduction chain was
1984 detected. */
1985 return ok;
1986 }
1987
1988 /* Find SLP sequences starting from groups of reductions. The result is
1989 deliberately not checked: failure here does not invalidate the instances
1990 recorded above, and overall success is reported below regardless. */
1991 if (reductions.length () > 1)
1992 vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0], max_tree_size);
1993
1994 return true;
1995 }
1996
1997
1998 /* For each possible SLP instance decide whether to SLP it and calculate the
1999 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2000 to SLP at least one instance. */
2001
2002 bool
2003 vect_make_slp_decision (loop_vec_info loop_vinfo)
2004 {
2005 unsigned int i, unrolling_factor = 1;
2006 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2007 slp_instance instance;
2008 int decided_to_slp = 0;
2009
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
2012 "\n");
2013
2014 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2015 {
2016 /* FORNOW: SLP if you can. */
2017 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
2018 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
2019
2020 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2021 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2022 loop-based vectorization. Such stmts will be marked as HYBRID. */
2023 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2024 decided_to_slp++;
2025 }
2026
2027 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2028
2029 if (decided_to_slp && dump_enabled_p ())
2030 dump_printf_loc (MSG_NOTE, vect_location,
2031 "Decided to SLP %d instances. Unrolling factor %d\n",
2032 decided_to_slp, unrolling_factor);
2033
2034 return (decided_to_slp > 0);
2035 }
2036
2037
2038 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2039 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2040
2041 static void
2042 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2043 {
2044 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
2045 imm_use_iterator imm_iter;
2046 gimple use_stmt;
2047 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
2048 slp_tree child;
2049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2050 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2051 int j;
2052
2053 /* Propagate hybrid down the SLP tree. */
2054 if (stype == hybrid)
2055 ;
2056 else if (HYBRID_SLP_STMT (stmt_vinfo))
2057 stype = hybrid;
2058 else
2059 {
2060 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2061 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2062 /* We always get the pattern stmt here, but for immediate
2063 uses we have to use the LHS of the original stmt. */
2064 gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2065 if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
2066 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
2067 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
2068 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
2069 {
2070 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2071 continue;
2072 use_vinfo = vinfo_for_stmt (use_stmt);
2073 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2074 && STMT_VINFO_RELATED_STMT (use_vinfo))
2075 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
2076 if (!STMT_SLP_TYPE (use_vinfo)
2077 && (STMT_VINFO_RELEVANT (use_vinfo)
2078 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2079 && !(gimple_code (use_stmt) == GIMPLE_PHI
2080 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2081 stype = hybrid;
2082 }
2083 }
2084
2085 if (stype == hybrid)
2086 {
2087 if (dump_enabled_p ())
2088 {
2089 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2090 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2091 }
2092 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2093 }
2094
2095 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2096 if (child)
2097 vect_detect_hybrid_slp_stmts (child, i, stype);
2098 }
2099
2100 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2101
2102 static tree
2103 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2104 {
2105 walk_stmt_info *wi = (walk_stmt_info *)data;
2106 struct loop *loopp = (struct loop *)wi->info;
2107
2108 if (wi->is_lhs)
2109 return NULL_TREE;
2110
2111 if (TREE_CODE (*tp) == SSA_NAME
2112 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2113 {
2114 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
2115 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2116 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2117 {
2118 if (dump_enabled_p ())
2119 {
2120 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2121 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2122 }
2123 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2124 }
2125 }
2126
2127 return NULL_TREE;
2128 }
2129
2130 static tree
2131 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2132 walk_stmt_info *)
2133 {
2134 /* If the stmt is in an SLP instance then this isn't a reason
2135 to mark use definitions in other SLP instances as hybrid. */
2136 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2137 *handled = true;
2138 return NULL_TREE;
2139 }
2140
2141 /* Find stmts that must be both vectorized and SLPed. */
2142
2143 void
2144 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2145 {
2146 unsigned int i;
2147 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2148 slp_instance instance;
2149
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2152 "\n");
2153
2154 /* First walk all pattern stmts in the loop and mark the defs of their
2155 uses as hybrid, because immediate uses in pattern stmts are not recorded. */
2156 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2157 {
2158 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2159 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2160 gsi_next (&gsi))
2161 {
2162 gimple stmt = gsi_stmt (gsi);
2163 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2164 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2165 {
2166 walk_stmt_info wi;
2167 memset (&wi, 0, sizeof (wi));
2168 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2169 gimple_stmt_iterator gsi2
2170 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2171 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2172 vect_detect_hybrid_slp_1, &wi);
2173 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2174 vect_detect_hybrid_slp_2,
2175 vect_detect_hybrid_slp_1, &wi);
2176 }
2177 }
2178 }
2179
2180 /* Then walk the SLP instance trees marking stmts with uses in
2181 non-SLP stmts as hybrid, also propagating hybrid down the
2182 SLP tree, collecting the above info on-the-fly. */
2183 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2184 {
2185 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2186 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2187 i, pure_slp);
2188 }
2189 }
2190
2191
2192 /* Create and initialize a new bb_vec_info struct for BB, as well as
2193 stmt_vec_info structs for all the stmts in it. */
2194
2195 static bb_vec_info
2196 new_bb_vec_info (basic_block bb)
2197 {
2198 bb_vec_info res = NULL;
2199 gimple_stmt_iterator gsi;
2200
2201 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2202 BB_VINFO_BB (res) = bb;
2203
2204 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2205 {
2206 gimple stmt = gsi_stmt (gsi);
2207 gimple_set_uid (stmt, 0);
2208 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2209 }
2210
2211 BB_VINFO_GROUPED_STORES (res).create (10);
2212 BB_VINFO_SLP_INSTANCES (res).create (2);
2213 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2214
2215 bb->aux = res;
2216 return res;
2217 }
2218
2219
2220 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2221 stmts in the basic block. */
2222
2223 static void
2224 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2225 {
2226 vec<slp_instance> slp_instances;
2227 slp_instance instance;
2228 basic_block bb;
2229 gimple_stmt_iterator si;
2230 unsigned i;
2231
2232 if (!bb_vinfo)
2233 return;
2234
2235 bb = BB_VINFO_BB (bb_vinfo);
2236
2237 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2238 {
2239 gimple stmt = gsi_stmt (si);
2240 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2241
2242 if (stmt_info)
2243 /* Free stmt_vec_info. */
2244 free_stmt_vec_info (stmt);
2245 }
2246
2247 vect_destroy_datarefs (NULL, bb_vinfo);
2248 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2249 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2250 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2251 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2252 vect_free_slp_instance (instance);
2253 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2254 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2255 free (bb_vinfo);
2256 bb->aux = NULL;
2257 }
2258
2259
2260 /* Analyze statements contained in SLP tree node after recursively analyzing
2261 the subtree. Return TRUE if the operations are supported. */
2262
2263 static bool
2264 vect_slp_analyze_node_operations (slp_tree node)
2265 {
2266 bool dummy;
2267 int i;
2268 gimple stmt;
2269 slp_tree child;
2270
2271 if (!node)
2272 return true;
2273
2274 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2275 if (!vect_slp_analyze_node_operations (child))
2276 return false;
2277
2278 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2279 {
2280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2281 gcc_assert (stmt_info);
2282 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2283
2284 if (!vect_analyze_stmt (stmt, &dummy, node))
2285 return false;
2286 }
2287
2288 return true;
2289 }
2290
2291
2292 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2293 operations are supported. */
2294
2295 bool
2296 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2297 {
2298 slp_instance instance;
2299 int i;
2300
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "=== vect_slp_analyze_operations ===\n");
2304
2305 for (i = 0; slp_instances.iterate (i, &instance); )
2306 {
2307 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2308 {
2309 dump_printf_loc (MSG_NOTE, vect_location,
2310 "removing SLP instance operations starting from: ");
2311 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2312 SLP_TREE_SCALAR_STMTS
2313 (SLP_INSTANCE_TREE (instance))[0], 0);
2314 vect_free_slp_instance (instance);
2315 slp_instances.ordered_remove (i);
2316 }
2317 else
2318 {
2319 /* Compute the costs of the SLP instance. */
2320 vect_analyze_slp_cost (instance, data);
2321 i++;
2322 }
2323 }
2324
2325 if (!slp_instances.length ())
2326 return false;
2327
2328 return true;
2329 }
2330
2331
2332 /* Compute the scalar cost of the SLP node NODE and its children
2333 and return it. Do not account for defs that are marked in LIFE,
2334 and update LIFE according to the uses of NODE. */
2335
2336 static unsigned
2337 vect_bb_slp_scalar_cost (basic_block bb,
2338 slp_tree node, vec<bool, va_heap> *life)
2339 {
2340 unsigned scalar_cost = 0;
2341 unsigned i;
2342 gimple stmt;
2343 slp_tree child;
2344
2345 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2346 {
2347 unsigned stmt_cost;
2348 ssa_op_iter op_iter;
2349 def_operand_p def_p;
2350 stmt_vec_info stmt_info;
2351
2352 if ((*life)[i])
2353 continue;
2354
2355 /* If there is a non-vectorized use of the defs then the scalar
2356 stmt is kept live in which case we do not account it or any
2357 required defs in the SLP children in the scalar cost. This
2358 way we make the vectorization more costly when compared to
2359 the scalar cost. */
2360 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2361 {
2362 imm_use_iterator use_iter;
2363 gimple use_stmt;
2364 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2365 if (!is_gimple_debug (use_stmt)
2366 && (gimple_code (use_stmt) == GIMPLE_PHI
2367 || gimple_bb (use_stmt) != bb
2368 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2369 {
2370 (*life)[i] = true;
2371 BREAK_FROM_IMM_USE_STMT (use_iter);
2372 }
2373 }
2374 if ((*life)[i])
2375 continue;
2376
2377 stmt_info = vinfo_for_stmt (stmt);
2378 if (STMT_VINFO_DATA_REF (stmt_info))
2379 {
2380 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2381 stmt_cost = vect_get_stmt_cost (scalar_load);
2382 else
2383 stmt_cost = vect_get_stmt_cost (scalar_store);
2384 }
2385 else
2386 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2387
2388 scalar_cost += stmt_cost;
2389 }
2390
2391 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2392 if (child)
2393 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2394
2395 return scalar_cost;
2396 }
2397
2398 /* Check if vectorization of the basic block is profitable. */
2399
2400 static bool
2401 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2402 {
2403 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2404 slp_instance instance;
2405 int i;
2406 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2407 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2408
2409 /* Calculate scalar cost. */
2410 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2411 {
2412 auto_vec<bool, 20> life;
2413 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
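/* LIFE[j] is set by vect_bb_slp_scalar_cost when the j-th scalar stmt has
   a use outside the vectorized stmts and must be kept live; such stmts do
   not count towards the saved scalar cost. */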
2414 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2415 SLP_INSTANCE_TREE (instance),
2416 &life);
2417 }
2418
2419 /* Complete the target-specific cost calculation. */
2420 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2421 &vec_inside_cost, &vec_epilogue_cost);
2422
2423 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2424
2425 if (dump_enabled_p ())
2426 {
2427 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2428 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2429 vec_inside_cost);
2430 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2431 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2432 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2433 }
2434
2435 /* Vectorization is profitable if its cost is less than the cost of the
2436 scalar version. */
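/* A made-up numeric illustration: with a scalar cost of 12 and vector costs
   of 6 (inside) + 2 (prologue) + 1 (epilogue) == 9, the check below passes
   and the block is considered profitable. */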
2437 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2438 return false;
2439
2440 return true;
2441 }
2442
2443 /* Check if the basic block can be vectorized. */
2444
2445 static bb_vec_info
2446 vect_slp_analyze_bb_1 (basic_block bb)
2447 {
2448 bb_vec_info bb_vinfo;
2449 vec<slp_instance> slp_instances;
2450 slp_instance instance;
2451 int i;
2452 int min_vf = 2;
2453 unsigned n_stmts = 0;
2454
2455 bb_vinfo = new_bb_vec_info (bb);
2456 if (!bb_vinfo)
2457 return NULL;
2458
2459 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2460 {
2461 if (dump_enabled_p ())
2462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2463 "not vectorized: unhandled data-ref in basic "
2464 "block.\n");
2465
2466 destroy_bb_vec_info (bb_vinfo);
2467 return NULL;
2468 }
2469
2470 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2471 {
2472 if (dump_enabled_p ())
2473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2474 "not vectorized: not enough data-refs in "
2475 "basic block.\n");
2476
2477 destroy_bb_vec_info (bb_vinfo);
2478 return NULL;
2479 }
2480
2481 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2482 {
2483 if (dump_enabled_p ())
2484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2485 "not vectorized: unhandled data access in "
2486 "basic block.\n");
2487
2488 destroy_bb_vec_info (bb_vinfo);
2489 return NULL;
2490 }
2491
2492 vect_pattern_recog (NULL, bb_vinfo);
2493
2494 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2495 {
2496 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2498 "not vectorized: bad data alignment in basic "
2499 "block.\n");
2500
2501 destroy_bb_vec_info (bb_vinfo);
2502 return NULL;
2503 }
2504
2505 /* Check the SLP opportunities in the basic block, analyze and build SLP
2506 trees. */
2507 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2508 {
2509 if (dump_enabled_p ())
2510 {
2511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2512 "Failed to SLP the basic block.\n");
2513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514 "not vectorized: failed to find SLP opportunities "
2515 "in basic block.\n");
2516 }
2517
2518 destroy_bb_vec_info (bb_vinfo);
2519 return NULL;
2520 }
2521
2522 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2523
2524 /* Mark all the statements that we want to vectorize as pure SLP and
2525 relevant. */
2526 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2527 {
2528 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2529 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2530 }
2531
2532 /* Mark all the statements that we do not want to vectorize. */
2533 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2534 !gsi_end_p (gsi); gsi_next (&gsi))
2535 {
2536 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2537 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2538 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2539 }
2540
2541 /* Analyze dependences. At this point all stmts not participating in
2542 vectorization have to be marked. Dependence analysis assumes
2543 that we either vectorize all SLP instances or none at all. */
2544 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2545 {
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2548 "not vectorized: unhandled data dependence "
2549 "in basic block.\n");
2550
2551 destroy_bb_vec_info (bb_vinfo);
2552 return NULL;
2553 }
2554
2555 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2556 {
2557 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2559 "not vectorized: unsupported alignment in basic "
2560 "block.\n");
2561 destroy_bb_vec_info (bb_vinfo);
2562 return NULL;
2563 }
2564
2565 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2566 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2567 {
2568 if (dump_enabled_p ())
2569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2570 "not vectorized: bad operation in basic block.\n");
2571
2572 destroy_bb_vec_info (bb_vinfo);
2573 return NULL;
2574 }
2575
2576 /* Cost model: check if the vectorization is worthwhile. */
2577 if (!unlimited_cost_model (NULL)
2578 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2579 {
2580 if (dump_enabled_p ())
2581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2582 "not vectorized: vectorization is not "
2583 "profitable.\n");
2584
2585 destroy_bb_vec_info (bb_vinfo);
2586 return NULL;
2587 }
2588
2589 if (dump_enabled_p ())
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "Basic block will be vectorized using SLP\n");
2592
2593 return bb_vinfo;
2594 }
2595
2596
2597 bb_vec_info
2598 vect_slp_analyze_bb (basic_block bb)
2599 {
2600 bb_vec_info bb_vinfo;
2601 int insns = 0;
2602 gimple_stmt_iterator gsi;
2603 unsigned int vector_sizes;
2604
2605 if (dump_enabled_p ())
2606 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2607
2608 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2609 {
2610 gimple stmt = gsi_stmt (gsi);
2611 if (!is_gimple_debug (stmt)
2612 && !gimple_nop_p (stmt)
2613 && gimple_code (stmt) != GIMPLE_LABEL)
2614 insns++;
2615 }
2616
2617 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2618 {
2619 if (dump_enabled_p ())
2620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2621 "not vectorized: too many instructions in "
2622 "basic block.\n");
2623
2624 return NULL;
2625 }
2626
2627 /* Autodetect the first vector size we try. */
2628 current_vector_size = 0;
2629 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2630
2631 while (1)
2632 {
2633 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2634 if (bb_vinfo)
2635 return bb_vinfo;
2636
2637 destroy_bb_vec_info (bb_vinfo);
2638
2639 vector_sizes &= ~current_vector_size;
2640 if (vector_sizes == 0
2641 || current_vector_size == 0)
2642 return NULL;
2643
2644 /* Try the largest remaining vector size. */
2645 current_vector_size = 1 << floor_log2 (vector_sizes);
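/* E.g., on a hypothetical target advertising 32- and 16-byte vectors,
   VECTOR_SIZES starts as 0x30; if the 32-byte attempt fails, the mask
   becomes 0x10 and the statement above selects 16 for the retry. */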
2646 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "***** Re-trying analysis with "
2649 "vector size %d\n", current_vector_size);
2650 }
2651 }
2652
2653
2654 /* For constant and loop invariant defs of SLP_NODE this function returns
2655 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2656 OP_NUM determines which operand of the RHS of the scalar stmts we gather
2657 defs for. NUMBER_OF_VECTORS is the number of vector defs to create.
2658 REDUC_INDEX is the index of the reduction operand in the statements,
2659 unless it is -1. */
2660
2661 static void
2662 vect_get_constant_vectors (tree op, slp_tree slp_node,
2663 vec<tree> *vec_oprnds,
2664 unsigned int op_num, unsigned int number_of_vectors,
2665 int reduc_index)
2666 {
2667 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2668 gimple stmt = stmts[0];
2669 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2670 unsigned nunits;
2671 tree vec_cst;
2672 tree *elts;
2673 unsigned j, number_of_places_left_in_vector;
2674 tree vector_type;
2675 tree vop;
2676 int group_size = stmts.length ();
2677 unsigned int vec_num, i;
2678 unsigned number_of_copies = 1;
2679 vec<tree> voprnds;
2680 voprnds.create (number_of_vectors);
2681 bool constant_p, is_store;
2682 tree neutral_op = NULL;
2683 enum tree_code code = gimple_expr_code (stmt);
2684 gimple def_stmt;
2685 struct loop *loop;
2686 gimple_seq ctor_seq = NULL;
2687
2688 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2689 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2690
2691 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2692 && reduc_index != -1)
2693 {
2694 op_num = reduc_index;
2695 op = gimple_op (stmt, op_num + 1);
2696 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2697 we need either neutral operands or the original operands. See
2698 get_initial_def_for_reduction() for details. */
2699 switch (code)
2700 {
2701 case WIDEN_SUM_EXPR:
2702 case DOT_PROD_EXPR:
2703 case SAD_EXPR:
2704 case PLUS_EXPR:
2705 case MINUS_EXPR:
2706 case BIT_IOR_EXPR:
2707 case BIT_XOR_EXPR:
2708 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2709 neutral_op = build_real (TREE_TYPE (op), dconst0);
2710 else
2711 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2712
2713 break;
2714
2715 case MULT_EXPR:
2716 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2717 neutral_op = build_real (TREE_TYPE (op), dconst1);
2718 else
2719 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2720
2721 break;
2722
2723 case BIT_AND_EXPR:
2724 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2725 break;
2726
2727 /* For MIN/MAX we don't have an easy neutral operand but
2728 the initial values can be used fine here. Only for
2729 a reduction chain do we have to force a neutral element. */
2730 case MAX_EXPR:
2731 case MIN_EXPR:
2732 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2733 neutral_op = NULL;
2734 else
2735 {
2736 def_stmt = SSA_NAME_DEF_STMT (op);
2737 loop = (gimple_bb (stmt))->loop_father;
2738 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2739 loop_preheader_edge (loop));
2740 }
2741 break;
2742
2743 default:
2744 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2745 neutral_op = NULL;
2746 }
2747 }
2748
2749 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2750 {
2751 is_store = true;
2752 op = gimple_assign_rhs1 (stmt);
2753 }
2754 else
2755 is_store = false;
2756
2757 gcc_assert (op);
2758
2759 if (CONSTANT_CLASS_P (op))
2760 constant_p = true;
2761 else
2762 constant_p = false;
2763
2764 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2765 created vectors. It is greater than 1 if unrolling is performed.
2766
2767 For example, we have two scalar operands, s1 and s2 (e.g., group of
2768 strided accesses of size two), while NUNITS is four (i.e., four scalars
2769 of this type can be packed in a vector). The output vector will contain
2770 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2771 will be 2).
2772
2773 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2774 containing the operands.
2775
2776 For example, NUNITS is four as before, and the group size is 8
2777 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2778 {s5, s6, s7, s8}. */
2779
2780 number_of_copies = nunits * number_of_vectors / group_size;
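/* Tying this to the examples above (illustrative values): for {s1, s2} with
   NUNITS == 4 and NUMBER_OF_VECTORS == 1 this gives 4 * 1 / 2 == 2 copies;
   for the eight-element group with NUMBER_OF_VECTORS == 2 it gives
   4 * 2 / 8 == 1, i.e. each scalar appears exactly once. */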
2781
2782 number_of_places_left_in_vector = nunits;
2783 elts = XALLOCAVEC (tree, nunits);
2784 bool place_after_defs = false;
2785 for (j = 0; j < number_of_copies; j++)
2786 {
2787 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2788 {
2789 if (is_store)
2790 op = gimple_assign_rhs1 (stmt);
2791 else
2792 {
2793 switch (code)
2794 {
2795 case COND_EXPR:
2796 if (op_num == 0 || op_num == 1)
2797 {
2798 tree cond = gimple_assign_rhs1 (stmt);
2799 op = TREE_OPERAND (cond, op_num);
2800 }
2801 else
2802 {
2803 if (op_num == 2)
2804 op = gimple_assign_rhs2 (stmt);
2805 else
2806 op = gimple_assign_rhs3 (stmt);
2807 }
2808 break;
2809
2810 case CALL_EXPR:
2811 op = gimple_call_arg (stmt, op_num);
2812 break;
2813
2814 case LSHIFT_EXPR:
2815 case RSHIFT_EXPR:
2816 case LROTATE_EXPR:
2817 case RROTATE_EXPR:
2818 op = gimple_op (stmt, op_num + 1);
2819 /* Unlike the other binary operators, shifts/rotates have
2820 the shift count being int, instead of the same type as
2821 the lhs, so make sure the scalar is the right type if
2822 we are dealing with vectors of
2823 long long/long/short/char. */
2824 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2825 op = fold_convert (TREE_TYPE (vector_type), op);
2826 break;
2827
2828 default:
2829 op = gimple_op (stmt, op_num + 1);
2830 break;
2831 }
2832 }
2833
2834 if (reduc_index != -1)
2835 {
2836 loop = (gimple_bb (stmt))->loop_father;
2837 def_stmt = SSA_NAME_DEF_STMT (op);
2838
2839 gcc_assert (loop);
2840
2841 /* Get the def before the loop. In a reduction chain we have only
2842 one initial value. */
2843 if ((j != (number_of_copies - 1)
2844 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2845 && i != 0))
2846 && neutral_op)
2847 op = neutral_op;
2848 else
2849 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2850 loop_preheader_edge (loop));
2851 }
2852
2853 /* Create 'vect_ = {op0,op1,...,opn}'. */
2854 number_of_places_left_in_vector--;
2855 tree orig_op = op;
2856 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2857 {
2858 if (CONSTANT_CLASS_P (op))
2859 {
2860 op = fold_unary (VIEW_CONVERT_EXPR,
2861 TREE_TYPE (vector_type), op);
2862 gcc_assert (op && CONSTANT_CLASS_P (op));
2863 }
2864 else
2865 {
2866 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2867 gimple init_stmt;
2868 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2869 init_stmt
2870 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2871 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2872 op = new_temp;
2873 }
2874 }
2875 elts[number_of_places_left_in_vector] = op;
2876 if (!CONSTANT_CLASS_P (op))
2877 constant_p = false;
2878 if (TREE_CODE (orig_op) == SSA_NAME
2879 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2880 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2881 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2882 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2883 place_after_defs = true;
2884
2885 if (number_of_places_left_in_vector == 0)
2886 {
2887 number_of_places_left_in_vector = nunits;
2888
2889 if (constant_p)
2890 vec_cst = build_vector (vector_type, elts);
2891 else
2892 {
2893 vec<constructor_elt, va_gc> *v;
2894 unsigned k;
2895 vec_alloc (v, nunits);
2896 for (k = 0; k < nunits; ++k)
2897 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2898 vec_cst = build_constructor (vector_type, v);
2899 }
2900 tree init;
2901 gimple_stmt_iterator gsi;
2902 if (place_after_defs)
2903 {
2904 gsi = gsi_for_stmt
2905 (vect_find_last_scalar_stmt_in_slp (slp_node));
2906 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2907 }
2908 else
2909 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2910 if (ctor_seq != NULL)
2911 {
2912 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2913 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2914 GSI_SAME_STMT);
2915 ctor_seq = NULL;
2916 }
2917 voprnds.quick_push (init);
2918 place_after_defs = false;
2919 }
2920 }
2921 }
2922
2923 /* Since the vectors were created in reverse order, reverse them again
2924 here so that VEC_OPRNDS ends up in the correct order. */
2925 vec_num = voprnds.length ();
2926 for (j = vec_num; j != 0; j--)
2927 {
2928 vop = voprnds[j - 1];
2929 vec_oprnds->quick_push (vop);
2930 }
2931
2932 voprnds.release ();
2933
2934 /* If VF is greater than the unrolling factor needed for the SLP group of
2935 stmts, the NUMBER_OF_VECTORS to be created is greater than
2936 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, so we have
2937 to replicate the vectors. */
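/* E.g., if two vectors were created but NUMBER_OF_VECTORS is 4, the loop
   below pushes either a neutral vector (when NEUTRAL_OP is available for a
   reduction) or the already created vectors again. */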
2938 while (number_of_vectors > vec_oprnds->length ())
2939 {
2940 tree neutral_vec = NULL;
2941
2942 if (neutral_op)
2943 {
2944 if (!neutral_vec)
2945 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2946
2947 vec_oprnds->quick_push (neutral_vec);
2948 }
2949 else
2950 {
2951 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2952 vec_oprnds->quick_push (vop);
2953 }
2954 }
2955 }
2956
2957
2958 /* Get vectorized definitions from SLP_NODE that contains corresponding
2959 vectorized def-stmts. */
2960
2961 static void
2962 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2963 {
2964 tree vec_oprnd;
2965 gimple vec_def_stmt;
2966 unsigned int i;
2967
2968 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2969
2970 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2971 {
2972 gcc_assert (vec_def_stmt);
2973 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2974 vec_oprnds->quick_push (vec_oprnd);
2975 }
2976 }
2977
2978
2979 /* Get vectorized definitions for SLP_NODE.
2980 If the scalar definitions are loop invariants or constants, collect them and
2981 call vect_get_constant_vectors() to create vector stmts.
2982 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2983 must be stored in the corresponding child of SLP_NODE, and we call
2984 vect_get_slp_vect_defs () to retrieve them. */
2985
2986 void
2987 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2988 vec<vec<tree> > *vec_oprnds, int reduc_index)
2989 {
2990 gimple first_stmt;
2991 int number_of_vects = 0, i;
2992 unsigned int child_index = 0;
2993 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2994 slp_tree child = NULL;
2995 vec<tree> vec_defs;
2996 tree oprnd;
2997 bool vectorized_defs;
2998
2999 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3000 FOR_EACH_VEC_ELT (ops, i, oprnd)
3001 {
3002 /* For each operand we check if it has vectorized definitions in a child
3003 node or we need to create them (for invariants and constants). We
3004 check if the LHS of the first stmt of the next child matches OPRND.
3005 If it does, we found the correct child. Otherwise, we call
3006 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
3007 this child node is checked again for the next operand. */
3008 vectorized_defs = false;
3009 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3010 {
3011 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3012
3013 /* We have to check both pattern and original def, if available. */
3014 if (child)
3015 {
3016 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
3017 gimple related
3018 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
3019
3020 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
3021 || (related
3022 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
3023 {
3024 /* The number of vector defs is determined by the number of
3025 vector statements in the node from which we get those
3026 statements. */
3027 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3028 vectorized_defs = true;
3029 child_index++;
3030 }
3031 }
3032 else
3033 child_index++;
3034 }
3035
3036 if (!vectorized_defs)
3037 {
3038 if (i == 0)
3039 {
3040 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3041 /* Number of vector stmts was calculated according to LHS in
3042 vect_schedule_slp_instance (), fix it by replacing LHS with
3043 RHS, if necessary. See vect_get_smallest_scalar_type () for
3044 details. */
3045 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3046 &rhs_size_unit);
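/* Illustrative example: if the stmts produce ints (LHS size 4) from char
   operands (RHS size 1), four int vector stmts correspond to
   4 * 1 / 4 == 1 char vector, since four times as many chars fit in one
   vector. */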
3047 if (rhs_size_unit != lhs_size_unit)
3048 {
3049 number_of_vects *= rhs_size_unit;
3050 number_of_vects /= lhs_size_unit;
3051 }
3052 }
3053 }
3054
3055 /* Allocate memory for vectorized defs. */
3056 vec_defs = vNULL;
3057 vec_defs.create (number_of_vects);
3058
3059 /* For reduction defs we call vect_get_constant_vectors (), since we are
3060 looking for initial loop invariant values. */
3061 if (vectorized_defs && reduc_index == -1)
3062 /* The defs are already vectorized. */
3063 vect_get_slp_vect_defs (child, &vec_defs);
3064 else
3065 /* Build vectors from scalar defs. */
3066 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3067 number_of_vects, reduc_index);
3068
3069 vec_oprnds->quick_push (vec_defs);
3070
3071 /* For reductions, we only need initial values. */
3072 if (reduc_index != -1)
3073 return;
3074 }
3075 }
3076
3077
3078 /* Create NCOPIES permutation statements using the vector permutation mask
3079 MASK and two input vectors placed in
3080 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
3081 shifting by STRIDE elements of DR_CHAIN for every further copy.
3082 (STRIDE is the number of vectorized stmts for NODE divided by the number
3083 of copies).
3084 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE
3085 where the created stmts must be inserted. */
3086
3087 static inline void
3088 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
3089 tree mask, int first_vec_indx, int second_vec_indx,
3090 gimple_stmt_iterator *gsi, slp_tree node,
3091 tree vectype, vec<tree> dr_chain,
3092 int ncopies, int vect_stmts_counter)
3093 {
3094 tree perm_dest;
3095 gimple perm_stmt = NULL;
3096 stmt_vec_info next_stmt_info;
3097 int i, stride;
3098 tree first_vec, second_vec, data_ref;
3099
3100 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
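/* E.g. (made-up numbers): with 4 vector stmts and NCOPIES == 2 the stride
   is 2, so copy 0 fills slot VECT_STMTS_COUNTER and copy 1 fills slot
   2 + VECT_STMTS_COUNTER below. */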
3101
3102 /* Initialize the vect stmts of NODE to properly insert the generated
3103 stmts later. */
3104 for (i = SLP_TREE_VEC_STMTS (node).length ();
3105 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3106 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3107
3108 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3109 for (i = 0; i < ncopies; i++)
3110 {
3111 first_vec = dr_chain[first_vec_indx];
3112 second_vec = dr_chain[second_vec_indx];
3113
3114 /* Generate the permute statement. */
3115 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3116 first_vec, second_vec, mask);
3117 data_ref = make_ssa_name (perm_dest, perm_stmt);
3118 gimple_set_lhs (perm_stmt, data_ref);
3119 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3120
3121 /* Store the vector statement in NODE. */
3122 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3123
3124 first_vec_indx += stride;
3125 second_vec_indx += stride;
3126 }
3127
3128 /* Mark the scalar stmt as vectorized. */
3129 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
3130 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
3131 }
3132
3133
3134 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
3135 return in CURRENT_MASK_ELEMENT its equivalent in target-specific
3136 representation. Check that the mask is valid and return FALSE if not.
3137 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
3138 the next vector, i.e., the current first vector is not needed. */
3139
3140 static bool
3141 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
3142 int mask_nunits, bool only_one_vec, int index,
3143 unsigned char *mask, int *current_mask_element,
3144 bool *need_next_vector, int *number_of_mask_fixes,
3145 bool *mask_fixed, bool *needs_first_vector)
3146 {
3147 int i;
3148
3149 /* Convert to target specific representation. */
3150 *current_mask_element = first_mask_element + m;
3151 /* Adjust the value in case it's a mask for second and third vectors. */
3152 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3153
3154 if (*current_mask_element < 0)
3155 {
3156 if (dump_enabled_p ())
3157 {
3158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159 "permutation requires past vector ");
3160 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3161 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3162 }
3163 return false;
3164 }
3165
3166 if (*current_mask_element < mask_nunits)
3167 *needs_first_vector = true;
3168
3169 /* We have only one input vector to permute but the mask accesses values in
3170 the next vector as well. */
3171 if (only_one_vec && *current_mask_element >= mask_nunits)
3172 {
3173 if (dump_enabled_p ())
3174 {
3175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3176 "permutation requires at least two vectors ");
3177 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3178 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3179 }
3180
3181 return false;
3182 }
3183
3184 /* The mask requires the next vector. */
3185 while (*current_mask_element >= mask_nunits * 2)
3186 {
3187 if (*needs_first_vector || *mask_fixed)
3188 {
3189 /* We either need the first vector too or have already moved to the
3190 next vector. In both cases, this permutation needs three
3191 vectors. */
3192 if (dump_enabled_p ())
3193 {
3194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3195 "permutation requires at "
3196 "least three vectors ");
3197 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3198 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3199 }
3200
3201 return false;
3202 }
3203
3204 /* We move to the next vector, dropping the first one and working with
3205 the second and the third - we need to adjust the values of the mask
3206 accordingly. */
3207 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3208
3209 for (i = 0; i < index; i++)
3210 mask[i] -= mask_nunits * *number_of_mask_fixes;
3211
3212 (*number_of_mask_fixes)++;
3213 *mask_fixed = true;
3214 }
3215
3216 *need_next_vector = *mask_fixed;
3217
3218 /* This was the last element of this mask. Start a new one. */
3219 if (index == mask_nunits - 1)
3220 {
3221 *number_of_mask_fixes = 1;
3222 *mask_fixed = false;
3223 *needs_first_vector = false;
3224 }
3225
3226 return true;
3227 }
3228
3229
3230 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3231 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3232 permute statements for the SLP node NODE of the SLP instance
3233 SLP_NODE_INSTANCE. */
3234
3235 bool
3236 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3237 gimple_stmt_iterator *gsi, int vf,
3238 slp_instance slp_node_instance, bool analyze_only)
3239 {
3240 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3241 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3242 tree mask_element_type = NULL_TREE, mask_type;
3243 int i, j, k, nunits, vec_index = 0, scalar_index;
3244 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3245 gimple next_scalar_stmt;
3246 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3247 int first_mask_element;
3248 int index, unroll_factor, current_mask_element, ncopies;
3249 unsigned char *mask;
3250 bool only_one_vec = false, need_next_vector = false;
3251 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3252 int number_of_mask_fixes = 1;
3253 bool mask_fixed = false;
3254 bool needs_first_vector = false;
3255 machine_mode mode;
3256
3257 mode = TYPE_MODE (vectype);
3258
3259 if (!can_vec_perm_p (mode, false, NULL))
3260 {
3261 if (dump_enabled_p ())
3262 {
3263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3264 "no vect permute for ");
3265 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3266 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3267 }
3268 return false;
3269 }
3270
3271 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3272 same size as the vector element being permuted. */
3273 mask_element_type = lang_hooks.types.type_for_mode
3274 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3275 mask_type = get_vectype_for_scalar_type (mask_element_type);
3276 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3277 mask = XALLOCAVEC (unsigned char, nunits);
3278 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3279
3280 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
3281 unrolling factor. */
3282 orig_vec_stmts_num = group_size *
3283 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
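/* For the interleaved a0b0c0... example described below (illustration
   only): GROUP_SIZE == 3, an unrolling factor of 4 and NUNITS == 4 give
   3 * 4 / 4 == 3 input vectors. */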
3284 if (orig_vec_stmts_num == 1)
3285 only_one_vec = true;
3286
3287 /* The number of copies is determined by the final vectorization factor
3288 relative to the SLP_NODE_INSTANCE unrolling factor. */
3289 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
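/* E.g. (made-up factors): with VF == 8 and an instance unrolling factor
   of 4, each permutation is emitted NCOPIES == 2 times. */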
3290
3291 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3292 return false;
3293
3294 /* Generate permutation masks for every NODE. The number of masks for each
3295 NODE is equal to GROUP_SIZE.
3296 E.g., we have a group of three nodes with three loads from the same
3297 location in each node, and the vector size is 4. I.e., we have an
3298 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3299 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3300 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3301 ...
3302
3303 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3304 The last mask is illegal since we assume two operands for permute
3305 operation, and the mask element values can't be outside that range.
3306 Hence, the last mask must be converted into {2,5,5,5}.
3307 For the first two permutations we need the first and the second input
3308 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3309 we need the second and the third vectors: {b1,c1,a2,b2} and
3310 {c2,a3,b3,c3}. */
3311
3312 {
3313 scalar_index = 0;
3314 index = 0;
3315 vect_stmts_counter = 0;
3316 vec_index = 0;
3317 first_vec_index = vec_index++;
3318 if (only_one_vec)
3319 second_vec_index = first_vec_index;
3320 else
3321 second_vec_index = vec_index++;
3322
3323 for (j = 0; j < unroll_factor; j++)
3324 {
3325 for (k = 0; k < group_size; k++)
3326 {
3327 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3328 first_mask_element = i + j * group_size;
3329 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3330 nunits, only_one_vec, index,
3331 mask, &current_mask_element,
3332 &need_next_vector,
3333 &number_of_mask_fixes, &mask_fixed,
3334 &needs_first_vector))
3335 return false;
3336 gcc_assert (current_mask_element >= 0
3337 && current_mask_element < 2 * nunits);
3338 mask[index++] = current_mask_element;
3339
3340 if (index == nunits)
3341 {
3342 index = 0;
3343 if (!can_vec_perm_p (mode, false, mask))
3344 {
3345 if (dump_enabled_p ())
3346 {
3347 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3348 vect_location,
3349 "unsupported vect permute { ");
3350 for (i = 0; i < nunits; ++i)
3351 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3352 mask[i]);
3353 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3354 }
3355 return false;
3356 }
3357
3358 if (!analyze_only)
3359 {
3360 int l;
3361 tree mask_vec, *mask_elts;
3362 mask_elts = XALLOCAVEC (tree, nunits);
3363 for (l = 0; l < nunits; ++l)
3364 mask_elts[l] = build_int_cst (mask_element_type,
3365 mask[l]);
3366 mask_vec = build_vector (mask_type, mask_elts);
3367
3368 if (need_next_vector)
3369 {
3370 first_vec_index = second_vec_index;
3371 second_vec_index = vec_index;
3372 }
3373
3374 next_scalar_stmt
3375 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3376
3377 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3378 mask_vec, first_vec_index, second_vec_index,
3379 gsi, node, vectype, dr_chain,
3380 ncopies, vect_stmts_counter++);
3381 }
3382 }
3383 }
3384 }
3385 }
3386
3387 return true;
3388 }
3389
3390
3391
3392 /* Vectorize SLP instance tree in postorder. */
3393
3394 static bool
3395 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3396 unsigned int vectorization_factor)
3397 {
3398 gimple stmt;
3399 bool grouped_store, is_store;
3400 gimple_stmt_iterator si;
3401 stmt_vec_info stmt_info;
3402 unsigned int vec_stmts_size, nunits, group_size;
3403 tree vectype;
3404 int i;
3405 slp_tree child;
3406
3407 if (!node)
3408 return false;
3409
3410 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3411 vect_schedule_slp_instance (child, instance, vectorization_factor);
3412
3413 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3414 stmt_info = vinfo_for_stmt (stmt);
3415
3416 /* VECTYPE is the type of the destination. */
3417 vectype = STMT_VINFO_VECTYPE (stmt_info);
3418 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3419 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3420
3421 /* For each SLP instance calculate the number of vector stmts to be created
3422 for the scalar stmts in each node of the SLP tree. The number of vector
3423 elements in one vector iteration is the number of scalar elements in
3424 one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
3425 size.
3426 Unless this is an SLP reduction, in which case the number of vector
3427 stmts is equal to the number of vector stmts of the children. */
3428 if (GROUP_FIRST_ELEMENT (stmt_info)
3429 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3430 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3431 else
3432 vec_stmts_size = (vectorization_factor * group_size) / nunits;
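/* Illustration (values made up) for the non-reduction case: with VF == 4,
   GROUP_SIZE == 2 and NUNITS == 4 this yields (4 * 2) / 4 == 2 vector
   stmts for the node. */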
3433
3434 if (!SLP_TREE_VEC_STMTS (node).exists ())
3435 {
3436 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3437 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3438 }
3439
3440 if (dump_enabled_p ())
3441 {
3442 dump_printf_loc (MSG_NOTE, vect_location,
3443 "------>vectorizing SLP node starting from: ");
3444 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3445 dump_printf (MSG_NOTE, "\n");
3446 }
3447
3448 /* Vectorized stmts go before the last scalar stmt which is where
3449 all uses are ready. */
3450 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3451
3452 /* Mark the first element of the reduction chain as reduction to properly
3453 transform the node. In the analysis phase only the last element of the
3454 chain is marked as reduction. */
3455 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3456 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3457 {
3458 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3459 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3460 }
3461
3462 /* Handle two-operation SLP nodes by vectorizing the group with
3463 both operations and then performing a merge. */
3464 if (SLP_TREE_TWO_OPERATORS (node))
3465 {
3466 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3467 enum tree_code ocode;
3468 gimple ostmt;
3469 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3470 bool allsame = true;
3471 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3472 if (gimple_assign_rhs_code (ostmt) != code0)
3473 {
3474 mask[i] = 1;
3475 allsame = false;
3476 ocode = gimple_assign_rhs_code (ostmt);
3477 }
3478 else
3479 mask[i] = 0;
3480 if (!allsame)
3481 {
3482 vec<gimple> v0;
3483 vec<gimple> v1;
3484 unsigned j;
3485 tree tmask = NULL_TREE;
3486 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3487 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3488 SLP_TREE_VEC_STMTS (node).truncate (0);
3489 gimple_assign_set_rhs_code (stmt, ocode);
3490 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3491 gimple_assign_set_rhs_code (stmt, code0);
3492 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3493 SLP_TREE_VEC_STMTS (node).truncate (0);
3494 tree meltype = build_nonstandard_integer_type
3495 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3496 tree mvectype = get_same_sized_vectype (meltype, vectype);
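/* Sketch of the merge mask built below (illustrative values): for a
   {plus, minus, plus, minus} group MASK is {0, 1, 0, 1}; with four lanes
   MELTS becomes {0, 5, 2, 7}, selecting even lanes from the CODE0 vectors
   and odd lanes from the OCODE vectors, as a vector addsub instruction
   would. */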
3497 unsigned k = 0, l;
3498 for (j = 0; j < v0.length (); ++j)
3499 {
3500 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3501 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3502 {
3503 if (k >= group_size)
3504 k = 0;
3505 melts[l] = build_int_cst
3506 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3507 }
3508 tmask = build_vector (mvectype, melts);
3509
3510 /* ??? Not all targets support a VEC_PERM_EXPR with a
3511 constant mask that would translate to a vec_merge RTX
3512 (with their vec_perm_const_ok). We can either not
3513 vectorize in that case or let veclower do its job.
3514 Unfortunately that isn't too great and at least for
3515 plus/minus we'd eventually like to match targets
3516 vector addsub instructions. */
3517 gimple vstmt;
3518 vstmt = gimple_build_assign (make_ssa_name (vectype),
3519 VEC_PERM_EXPR,
3520 gimple_assign_lhs (v0[j]),
3521 gimple_assign_lhs (v1[j]), tmask);
3522 vect_finish_stmt_generation (stmt, vstmt, &si);
3523 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3524 }
3525 v0.release ();
3526 v1.release ();
3527 return false;
3528 }
3529 }
3530 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3531 return is_store;
3532 }
3533
3534 /* Replace the scalar calls in SLP node NODE with assignments setting their
3535 lhs to zero. For loop vectorization this is done in vectorizable_call,
3536 but for SLP it needs to be deferred until the end of vect_schedule_slp,
3537 because multiple SLP instances may refer to the same scalar stmt. */
3538
3539 static void
3540 vect_remove_slp_scalar_calls (slp_tree node)
3541 {
3542 gimple stmt, new_stmt;
3543 gimple_stmt_iterator gsi;
3544 int i;
3545 slp_tree child;
3546 tree lhs;
3547 stmt_vec_info stmt_info;
3548
3549 if (!node)
3550 return;
3551
3552 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3553 vect_remove_slp_scalar_calls (child);
3554
3555 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3556 {
3557 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3558 continue;
3559 stmt_info = vinfo_for_stmt (stmt);
3560 if (stmt_info == NULL
3561 || is_pattern_stmt_p (stmt_info)
3562 || !PURE_SLP_STMT (stmt_info))
3563 continue;
3564 lhs = gimple_call_lhs (stmt);
3565 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3566 set_vinfo_for_stmt (new_stmt, stmt_info);
3567 set_vinfo_for_stmt (stmt, NULL);
3568 STMT_VINFO_STMT (stmt_info) = new_stmt;
3569 gsi = gsi_for_stmt (stmt);
3570 gsi_replace (&gsi, new_stmt, false);
3571 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3572 }
3573 }
3574
3575 /* Generate vector code for all SLP instances in the loop/basic block. */
3576
3577 bool
3578 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3579 {
3580 vec<slp_instance> slp_instances;
3581 slp_instance instance;
3582 unsigned int i, vf;
3583 bool is_store = false;
3584
3585 if (loop_vinfo)
3586 {
3587 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3588 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3589 }
3590 else
3591 {
3592 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3593 vf = 1;
3594 }
3595
3596 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3597 {
3598 /* Schedule the tree of INSTANCE. */
3599 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3600 instance, vf);
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_NOTE, vect_location,
3603 "vectorizing stmts using SLP.\n");
3604 }
3605
3606 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3607 {
3608 slp_tree root = SLP_INSTANCE_TREE (instance);
3609 gimple store;
3610 unsigned int j;
3611 gimple_stmt_iterator gsi;
3612
3613 /* Remove scalar call stmts. Do not do this for basic-block
3614 vectorization as not all uses may be vectorized.
3615 ??? Why should this be necessary? DCE should be able to
3616 remove the stmts itself.
3617 ??? For BB vectorization we can as well remove scalar
3618 stmts starting from the SLP tree root if they have no
3619 uses. */
3620 if (loop_vinfo)
3621 vect_remove_slp_scalar_calls (root);
3622
3623 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3624 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3625 {
3626 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3627 break;
3628
3629 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3630 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3631 /* Free the attached stmt_vec_info and remove the stmt. */
3632 gsi = gsi_for_stmt (store);
3633 unlink_stmt_vdef (store);
3634 gsi_remove (&gsi, true);
3635 release_defs (store);
3636 free_stmt_vec_info (store);
3637 }
3638 }
3639
3640 return is_store;
3641 }
3642
3643
3644 /* Vectorize the basic block. */
3645
3646 void
3647 vect_slp_transform_bb (basic_block bb)
3648 {
3649 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3650 gimple_stmt_iterator si;
3651
3652 gcc_assert (bb_vinfo);
3653
3654 if (dump_enabled_p ())
3655 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3656
3657 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3658 {
3659 gimple stmt = gsi_stmt (si);
3660 stmt_vec_info stmt_info;
3661
3662 if (dump_enabled_p ())
3663 {
3664 dump_printf_loc (MSG_NOTE, vect_location,
3665 "------>SLPing statement: ");
3666 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3667 dump_printf (MSG_NOTE, "\n");
3668 }
3669
3670 stmt_info = vinfo_for_stmt (stmt);
3671 gcc_assert (stmt_info);
3672
3673 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3674 if (STMT_SLP_TYPE (stmt_info))
3675 {
3676 vect_schedule_slp (NULL, bb_vinfo);
3677 break;
3678 }
3679 }
3680
3681 if (dump_enabled_p ())
3682 dump_printf_loc (MSG_NOTE, vect_location,
3683 "BASIC BLOCK VECTORIZED\n");
3684
3685 destroy_bb_vec_info (bb_vinfo);
3686 }