gcc/tree-vect-slp.c (re PR tree-optimization/37021: Fortran Complex reduction / multiplication not vectorized)
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "target.h"
40 #include "predict.h"
41 #include "hard-reg-set.h"
42 #include "function.h"
43 #include "basic-block.h"
44 #include "gimple-pretty-print.h"
45 #include "tree-ssa-alias.h"
46 #include "internal-fn.h"
47 #include "gimple-expr.h"
48 #include "is-a.h"
49 #include "gimple.h"
50 #include "gimple-iterator.h"
51 #include "gimple-ssa.h"
52 #include "tree-phinodes.h"
53 #include "ssa-iterators.h"
54 #include "stringpool.h"
55 #include "tree-ssanames.h"
56 #include "tree-pass.h"
57 #include "cfgloop.h"
58 #include "hashtab.h"
59 #include "rtl.h"
60 #include "flags.h"
61 #include "statistics.h"
62 #include "real.h"
63 #include "fixed-value.h"
64 #include "insn-config.h"
65 #include "expmed.h"
66 #include "dojump.h"
67 #include "explow.h"
68 #include "calls.h"
69 #include "emit-rtl.h"
70 #include "varasm.h"
71 #include "stmt.h"
72 #include "expr.h"
73 #include "recog.h" /* FIXME: for insn_data */
74 #include "insn-codes.h"
75 #include "optabs.h"
76 #include "tree-vectorizer.h"
77 #include "langhooks.h"
78 #include "gimple-walk.h"
79
   80 /* Extract the location of the basic block in the source code.
   81    Return the location of the first stmt that has one, else UNKNOWN_LOCATION.  */
82
83 source_location
84 find_bb_location (basic_block bb)
85 {
86 gimple stmt = NULL;
87 gimple_stmt_iterator si;
88
89 if (!bb)
90 return UNKNOWN_LOCATION;
91
92 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
93 {
94 stmt = gsi_stmt (si);
95 if (gimple_location (stmt) != UNKNOWN_LOCATION)
96 return gimple_location (stmt);
97 }
98
99 return UNKNOWN_LOCATION;
100 }
101
102
103 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
104
105 static void
106 vect_free_slp_tree (slp_tree node)
107 {
108 int i;
109 slp_tree child;
110
111 if (!node)
112 return;
113
114 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
115 vect_free_slp_tree (child);
116
117 SLP_TREE_CHILDREN (node).release ();
118 SLP_TREE_SCALAR_STMTS (node).release ();
119 SLP_TREE_VEC_STMTS (node).release ();
120 SLP_TREE_LOAD_PERMUTATION (node).release ();
121
122 free (node);
123 }
124
125
126 /* Free the memory allocated for the SLP instance. */
127
128 void
129 vect_free_slp_instance (slp_instance instance)
130 {
131 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
132 SLP_INSTANCE_LOADS (instance).release ();
133 SLP_INSTANCE_BODY_COST_VEC (instance).release ();
134 free (instance);
135 }
136
137
138 /* Create an SLP node for SCALAR_STMTS. */
139
140 static slp_tree
141 vect_create_new_slp_node (vec<gimple> scalar_stmts)
142 {
143 slp_tree node;
144 gimple stmt = scalar_stmts[0];
145 unsigned int nops;
146
147 if (is_gimple_call (stmt))
148 nops = gimple_call_num_args (stmt);
149 else if (is_gimple_assign (stmt))
150 {
151 nops = gimple_num_ops (stmt) - 1;
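      /* A COND_EXPR embeds a comparison; its two comparison operands are
	 counted in addition to the two arms, giving four operands.  */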
152 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
153 nops++;
154 }
155 else
156 return NULL;
157
158 node = XNEW (struct _slp_tree);
159 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
160 SLP_TREE_VEC_STMTS (node).create (0);
161 SLP_TREE_CHILDREN (node).create (nops);
162 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
163 SLP_TREE_TWO_OPERATORS (node) = false;
164
165 return node;
166 }
167
168
169 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
170 operand. */
171 static vec<slp_oprnd_info>
172 vect_create_oprnd_info (int nops, int group_size)
173 {
174 int i;
175 slp_oprnd_info oprnd_info;
176 vec<slp_oprnd_info> oprnds_info;
177
178 oprnds_info.create (nops);
179 for (i = 0; i < nops; i++)
180 {
181 oprnd_info = XNEW (struct _slp_oprnd_info);
182 oprnd_info->def_stmts.create (group_size);
183 oprnd_info->first_dt = vect_uninitialized_def;
184 oprnd_info->first_op_type = NULL_TREE;
185 oprnd_info->first_pattern = false;
186 oprnds_info.quick_push (oprnd_info);
187 }
188
189 return oprnds_info;
190 }
191
192
193 /* Free operands info. */
194
195 static void
196 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
197 {
198 int i;
199 slp_oprnd_info oprnd_info;
200
201 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
202 {
203 oprnd_info->def_stmts.release ();
204 XDELETE (oprnd_info);
205 }
206
207 oprnds_info.release ();
208 }
209
210
211 /* Find the place of the data-ref in STMT in the interleaving chain that starts
212 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
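/* For example, in the dense interleaving chain  a[0], a[1], a[2], a[3]
   started by the access to a[0], the place of the stmt accessing a[2] is 2:
   the accumulated GROUP_GAP of the chain elements up to and including it.  */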
213
214 static int
215 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
216 {
217 gimple next_stmt = first_stmt;
218 int result = 0;
219
220 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
221 return -1;
222
223 do
224 {
225 if (next_stmt == stmt)
226 return result;
227 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
228 if (next_stmt)
229 result += GROUP_GAP (vinfo_for_stmt (next_stmt));
230 }
231 while (next_stmt);
232
233 return -1;
234 }
235
236
  237 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
  238    they are of a valid type and that they match the defs of the first stmt of
  239    the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
  240    the mismatch could be corrected by swapping the operands of the operation,
  241    and 0 if everything is OK.  */
242
243 static int
244 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
245 gimple stmt, bool first,
246 vec<slp_oprnd_info> *oprnds_info)
247 {
248 tree oprnd;
249 unsigned int i, number_of_oprnds;
250 tree def;
251 gimple def_stmt;
252 enum vect_def_type dt = vect_uninitialized_def;
253 struct loop *loop = NULL;
254 bool pattern = false;
255 slp_oprnd_info oprnd_info;
256 int first_op_idx = 1;
257 bool commutative = false;
258 bool first_op_cond = false;
259
260 if (loop_vinfo)
261 loop = LOOP_VINFO_LOOP (loop_vinfo);
262
263 if (is_gimple_call (stmt))
264 {
265 number_of_oprnds = gimple_call_num_args (stmt);
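	  /* Call arguments start at gimple operand index 3, after the LHS,
	     the called function and the static chain.  */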
266 first_op_idx = 3;
267 }
268 else if (is_gimple_assign (stmt))
269 {
270 enum tree_code code = gimple_assign_rhs_code (stmt);
271 number_of_oprnds = gimple_num_ops (stmt) - 1;
272 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
273 {
274 first_op_cond = true;
275 commutative = true;
276 number_of_oprnds++;
277 }
278 else
279 commutative = commutative_tree_code (code);
280 }
281 else
282 return -1;
283
284 bool swapped = false;
285 for (i = 0; i < number_of_oprnds; i++)
286 {
287 again:
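      /* When SWAPPED is set, operand !I is fetched instead of I so that the
	 whole group is re-checked with the two commutative operands
	 exchanged.  */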
288 if (first_op_cond)
289 {
290 if (i == 0 || i == 1)
291 oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
292 swapped ? !i : i);
293 else
294 oprnd = gimple_op (stmt, first_op_idx + i - 1);
295 }
296 else
297 oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
298
299 oprnd_info = (*oprnds_info)[i];
300
301 if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
302 &def, &dt)
303 || (!def_stmt && dt != vect_constant_def))
304 {
305 if (dump_enabled_p ())
306 {
307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
308 "Build SLP failed: can't find def for ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
310 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
311 }
312
313 return -1;
314 }
315
316 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
317 from the pattern. Check that all the stmts of the node are in the
318 pattern. */
319 if (def_stmt && gimple_bb (def_stmt)
320 && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
321 || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
322 && gimple_code (def_stmt) != GIMPLE_PHI))
323 && vinfo_for_stmt (def_stmt)
324 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
325 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
326 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
327 {
328 pattern = true;
329 if (!first && !oprnd_info->first_pattern)
330 {
331 if (i == 0
332 && !swapped
333 && commutative)
334 {
335 swapped = true;
336 goto again;
337 }
338
339 if (dump_enabled_p ())
340 {
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
342 "Build SLP failed: some of the stmts"
343 " are in a pattern, and others are not ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
345 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
346 }
347
348 return 1;
349 }
350
351 def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
352 dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
353
354 if (dt == vect_unknown_def_type)
355 {
356 if (dump_enabled_p ())
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "Unsupported pattern.\n");
359 return -1;
360 }
361
362 switch (gimple_code (def_stmt))
363 {
364 case GIMPLE_PHI:
365 def = gimple_phi_result (def_stmt);
366 break;
367
368 case GIMPLE_ASSIGN:
369 def = gimple_assign_lhs (def_stmt);
370 break;
371
372 default:
373 if (dump_enabled_p ())
374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
375 "unsupported defining stmt:\n");
376 return -1;
377 }
378 }
379
380 if (first)
381 {
382 oprnd_info->first_dt = dt;
383 oprnd_info->first_pattern = pattern;
384 oprnd_info->first_op_type = TREE_TYPE (oprnd);
385 }
386 else
387 {
388 /* Not first stmt of the group, check that the def-stmt/s match
389 the def-stmt/s of the first stmt. Allow different definition
390 types for reduction chains: the first stmt must be a
391 vect_reduction_def (a phi node), and the rest
392 vect_internal_def. */
393 if (((oprnd_info->first_dt != dt
394 && !(oprnd_info->first_dt == vect_reduction_def
395 && dt == vect_internal_def)
396 && !((oprnd_info->first_dt == vect_external_def
397 || oprnd_info->first_dt == vect_constant_def)
398 && (dt == vect_external_def
399 || dt == vect_constant_def)))
400 || !types_compatible_p (oprnd_info->first_op_type,
401 TREE_TYPE (oprnd))))
402 {
403 /* Try swapping operands if we got a mismatch. */
404 if (i == 0
405 && !swapped
406 && commutative)
407 {
408 swapped = true;
409 goto again;
410 }
411
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414 "Build SLP failed: different types\n");
415
416 return 1;
417 }
418 }
419
420 /* Check the types of the definitions. */
421 switch (dt)
422 {
423 case vect_constant_def:
424 case vect_external_def:
425 case vect_reduction_def:
426 break;
427
428 case vect_internal_def:
429 oprnd_info->def_stmts.quick_push (def_stmt);
430 break;
431
432 default:
433 /* FORNOW: Not supported. */
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "Build SLP failed: illegal type of def ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
439 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
440 }
441
442 return -1;
443 }
444 }
445
446 /* Swap operands. */
447 if (swapped)
448 {
449 if (first_op_cond)
450 {
451 tree cond = gimple_assign_rhs1 (stmt);
452 swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
453 &TREE_OPERAND (cond, 1));
454 TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
455 }
456 else
457 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
458 gimple_assign_rhs2_ptr (stmt));
459 }
460
461 return 0;
462 }
463
464
  465 /* Verify that the scalar stmts STMTS are isomorphic, do not require a data
  466    permutation and are of supported types of operation.  Return true if so,
  467    otherwise return false and indicate in *MATCHES which stmts are not
  468    isomorphic to the first one.  If MATCHES[0] is false then this indicates
  469    the comparison could not be carried out or the stmts will never be
  470    vectorized by SLP.  */
471
472 static bool
473 vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
474 vec<gimple> stmts, unsigned int group_size,
475 unsigned nops, unsigned int *max_nunits,
476 unsigned int vectorization_factor, bool *matches,
477 bool *two_operators)
478 {
479 unsigned int i;
480 gimple first_stmt = stmts[0], stmt = stmts[0];
481 enum tree_code first_stmt_code = ERROR_MARK;
482 enum tree_code alt_stmt_code = ERROR_MARK;
483 enum tree_code rhs_code = ERROR_MARK;
484 enum tree_code first_cond_code = ERROR_MARK;
485 tree lhs;
486 bool need_same_oprnds = false;
487 tree vectype, scalar_type, first_op1 = NULL_TREE;
488 optab optab;
489 int icode;
490 machine_mode optab_op2_mode;
491 machine_mode vec_mode;
492 struct data_reference *first_dr;
493 HOST_WIDE_INT dummy;
494 gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
495 tree cond;
496
497 /* For every stmt in NODE find its def stmt/s. */
498 FOR_EACH_VEC_ELT (stmts, i, stmt)
499 {
500 matches[i] = false;
501
502 if (dump_enabled_p ())
503 {
504 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
505 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
506 dump_printf (MSG_NOTE, "\n");
507 }
508
509 /* Fail to vectorize statements marked as unvectorizable. */
510 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
511 {
512 if (dump_enabled_p ())
513 {
514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
515 "Build SLP failed: unvectorizable statement ");
516 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
517 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
518 }
519 /* Fatal mismatch. */
520 matches[0] = false;
521 return false;
522 }
523
524 lhs = gimple_get_lhs (stmt);
525 if (lhs == NULL_TREE)
526 {
527 if (dump_enabled_p ())
528 {
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "Build SLP failed: not GIMPLE_ASSIGN nor "
531 "GIMPLE_CALL ");
532 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
533 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
534 }
535 /* Fatal mismatch. */
536 matches[0] = false;
537 return false;
538 }
539
540 if (is_gimple_assign (stmt)
541 && gimple_assign_rhs_code (stmt) == COND_EXPR
542 && (cond = gimple_assign_rhs1 (stmt))
543 && !COMPARISON_CLASS_P (cond))
544 {
545 if (dump_enabled_p ())
546 {
547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
548 "Build SLP failed: condition is not "
549 "comparison ");
550 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
551 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
552 }
553 /* Fatal mismatch. */
554 matches[0] = false;
555 return false;
556 }
557
558 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
559 vectype = get_vectype_for_scalar_type (scalar_type);
560 if (!vectype)
561 {
562 if (dump_enabled_p ())
563 {
564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
565 "Build SLP failed: unsupported data-type ");
566 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
567 scalar_type);
568 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
569 }
570 /* Fatal mismatch. */
571 matches[0] = false;
572 return false;
573 }
574
575 /* If populating the vector type requires unrolling then fail
576 before adjusting *max_nunits for basic-block vectorization. */
577 if (bb_vinfo
578 && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
579 {
580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
581 "Build SLP failed: unrolling required "
582 "in basic block SLP\n");
583 /* Fatal mismatch. */
584 matches[0] = false;
585 return false;
586 }
587
588 /* In case of multiple types we need to detect the smallest type. */
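      /* The smallest scalar type yields the vector type with the most lanes,
	 so it determines the largest TYPE_VECTOR_SUBPARTS seen here.  */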
589 if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
590 {
591 *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
592 if (bb_vinfo)
593 vectorization_factor = *max_nunits;
594 }
595
596 if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
597 {
598 rhs_code = CALL_EXPR;
599 if (gimple_call_internal_p (call_stmt)
600 || gimple_call_tail_p (call_stmt)
601 || gimple_call_noreturn_p (call_stmt)
602 || !gimple_call_nothrow_p (call_stmt)
603 || gimple_call_chain (call_stmt))
604 {
605 if (dump_enabled_p ())
606 {
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Build SLP failed: unsupported call type ");
609 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
610 call_stmt, 0);
611 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
612 }
613 /* Fatal mismatch. */
614 matches[0] = false;
615 return false;
616 }
617 }
618 else
619 rhs_code = gimple_assign_rhs_code (stmt);
620
621 /* Check the operation. */
622 if (i == 0)
623 {
624 first_stmt_code = rhs_code;
625
626 /* Shift arguments should be equal in all the packed stmts for a
627 vector shift with scalar shift operand. */
628 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
629 || rhs_code == LROTATE_EXPR
630 || rhs_code == RROTATE_EXPR)
631 {
632 vec_mode = TYPE_MODE (vectype);
633
634 /* First see if we have a vector/vector shift. */
635 optab = optab_for_tree_code (rhs_code, vectype,
636 optab_vector);
637
638 if (!optab
639 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
640 {
641 /* No vector/vector shift, try for a vector/scalar shift. */
642 optab = optab_for_tree_code (rhs_code, vectype,
643 optab_scalar);
644
645 if (!optab)
646 {
647 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "Build SLP failed: no optab.\n");
650 /* Fatal mismatch. */
651 matches[0] = false;
652 return false;
653 }
654 icode = (int) optab_handler (optab, vec_mode);
655 if (icode == CODE_FOR_nothing)
656 {
657 if (dump_enabled_p ())
658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
659 "Build SLP failed: "
660 "op not supported by target.\n");
661 /* Fatal mismatch. */
662 matches[0] = false;
663 return false;
664 }
665 optab_op2_mode = insn_data[icode].operand[2].mode;
666 if (!VECTOR_MODE_P (optab_op2_mode))
667 {
668 need_same_oprnds = true;
669 first_op1 = gimple_assign_rhs2 (stmt);
670 }
671 }
672 }
673 else if (rhs_code == WIDEN_LSHIFT_EXPR)
674 {
675 need_same_oprnds = true;
676 first_op1 = gimple_assign_rhs2 (stmt);
677 }
678 }
679 else
680 {
681 if (first_stmt_code != rhs_code
682 && alt_stmt_code == ERROR_MARK)
683 alt_stmt_code = rhs_code;
684 if (first_stmt_code != rhs_code
685 && (first_stmt_code != IMAGPART_EXPR
686 || rhs_code != REALPART_EXPR)
687 && (first_stmt_code != REALPART_EXPR
688 || rhs_code != IMAGPART_EXPR)
689 /* Handle mismatches in plus/minus by computing both
690 and merging the results. */
691 && !((first_stmt_code == PLUS_EXPR
692 || first_stmt_code == MINUS_EXPR)
693 && (alt_stmt_code == PLUS_EXPR
694 || alt_stmt_code == MINUS_EXPR)
695 && rhs_code == alt_stmt_code)
696 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
697 && (first_stmt_code == ARRAY_REF
698 || first_stmt_code == BIT_FIELD_REF
699 || first_stmt_code == INDIRECT_REF
700 || first_stmt_code == COMPONENT_REF
701 || first_stmt_code == MEM_REF)))
702 {
703 if (dump_enabled_p ())
704 {
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
706 "Build SLP failed: different operation "
707 "in stmt ");
708 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
710 "original stmt ");
711 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
712 first_stmt, 0);
713 }
714 /* Mismatch. */
715 continue;
716 }
717
718 if (need_same_oprnds
719 && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
720 {
721 if (dump_enabled_p ())
722 {
723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
724 "Build SLP failed: different shift "
725 "arguments in ");
726 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
727 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
728 }
729 /* Mismatch. */
730 continue;
731 }
732
733 if (rhs_code == CALL_EXPR)
734 {
735 gimple first_stmt = stmts[0];
736 if (gimple_call_num_args (stmt) != nops
737 || !operand_equal_p (gimple_call_fn (first_stmt),
738 gimple_call_fn (stmt), 0)
739 || gimple_call_fntype (first_stmt)
740 != gimple_call_fntype (stmt))
741 {
742 if (dump_enabled_p ())
743 {
744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
745 "Build SLP failed: different calls in ");
746 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
747 stmt, 0);
748 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
749 }
750 /* Mismatch. */
751 continue;
752 }
753 }
754 }
755
756 /* Grouped store or load. */
757 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
758 {
759 if (REFERENCE_CLASS_P (lhs))
760 {
761 /* Store. */
762 ;
763 }
764 else
765 {
766 /* Load. */
767 unsigned unrolling_factor
768 = least_common_multiple
769 (*max_nunits, group_size) / group_size;
770 /* FORNOW: Check that there is no gap between the loads
771 and no gap between the groups when we need to load
772 multiple groups at once.
773 ??? We should enhance this to only disallow gaps
774 inside vectors. */
775 if ((unrolling_factor > 1
776 && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
777 && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
778 /* If the group is split up then GROUP_GAP
779 isn't correct here, nor is GROUP_FIRST_ELEMENT. */
780 || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
781 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
782 && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
783 {
784 if (dump_enabled_p ())
785 {
786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
787 "Build SLP failed: grouped "
788 "loads have gaps ");
789 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
790 stmt, 0);
791 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
792 }
793 /* Fatal mismatch. */
794 matches[0] = false;
795 return false;
796 }
797
  798 	      /* Check that the size of the interleaved loads group is not
  799 		 greater than the SLP group size.  */
800 unsigned ncopies
801 = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
802 if (loop_vinfo
803 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
804 && ((GROUP_SIZE (vinfo_for_stmt (stmt))
805 - GROUP_GAP (vinfo_for_stmt (stmt)))
806 > ncopies * group_size))
807 {
808 if (dump_enabled_p ())
809 {
810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
811 "Build SLP failed: the number "
812 "of interleaved loads is greater than "
813 "the SLP group size ");
814 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
815 stmt, 0);
816 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
817 }
818 /* Fatal mismatch. */
819 matches[0] = false;
820 return false;
821 }
822
823 old_first_load = first_load;
824 first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
825 if (prev_first_load)
826 {
827 /* Check that there are no loads from different interleaving
828 chains in the same node. */
829 if (prev_first_load != first_load)
830 {
831 if (dump_enabled_p ())
832 {
833 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
834 vect_location,
835 "Build SLP failed: different "
836 "interleaving chains in one node ");
837 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
838 stmt, 0);
839 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
840 }
841 /* Mismatch. */
842 continue;
843 }
844 }
845 else
846 prev_first_load = first_load;
847
848 /* In some cases a group of loads is just the same load
849 repeated N times. Only analyze its cost once. */
850 if (first_load == stmt && old_first_load != first_load)
851 {
852 first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
853 if (vect_supportable_dr_alignment (first_dr, false)
854 == dr_unaligned_unsupported)
855 {
856 if (dump_enabled_p ())
857 {
858 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
859 vect_location,
860 "Build SLP failed: unsupported "
861 "unaligned load ");
862 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
863 stmt, 0);
864 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
865 }
866 /* Fatal mismatch. */
867 matches[0] = false;
868 return false;
869 }
870 }
871 }
872 } /* Grouped access. */
873 else
874 {
875 if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
876 {
877 /* Not grouped load. */
878 if (dump_enabled_p ())
879 {
880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
881 "Build SLP failed: not grouped load ");
882 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
883 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
884 }
885
  886 	  /* FORNOW: non-grouped loads are not supported.  */
887 /* Fatal mismatch. */
888 matches[0] = false;
889 return false;
890 }
891
892 /* Not memory operation. */
893 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
894 && TREE_CODE_CLASS (rhs_code) != tcc_unary
895 && rhs_code != COND_EXPR
896 && rhs_code != CALL_EXPR)
897 {
898 if (dump_enabled_p ())
899 {
900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
901 "Build SLP failed: operation");
902 dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
903 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
904 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
905 }
906 /* Fatal mismatch. */
907 matches[0] = false;
908 return false;
909 }
910
911 if (rhs_code == COND_EXPR)
912 {
913 tree cond_expr = gimple_assign_rhs1 (stmt);
914
915 if (i == 0)
916 first_cond_code = TREE_CODE (cond_expr);
917 else if (first_cond_code != TREE_CODE (cond_expr))
918 {
919 if (dump_enabled_p ())
920 {
921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
922 "Build SLP failed: different"
923 " operation");
924 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
925 stmt, 0);
926 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
927 }
928 /* Mismatch. */
929 continue;
930 }
931 }
932 }
933
934 matches[i] = true;
935 }
936
937 for (i = 0; i < group_size; ++i)
938 if (!matches[i])
939 return false;
940
941 /* If we allowed a two-operation SLP node verify the target can cope
942 with the permute we are going to use. */
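   /* For example, for the group
	a[0] = b[0] + c[0];
	a[1] = b[1] - c[1];
      both a vector addition and a vector subtraction are generated and their
      results blended: SEL picks lane I from the first vector for stmts using
      FIRST_STMT_CODE and lane I of the second vector (offset by the number
      of lanes) for stmts using ALT_STMT_CODE.  */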
943 if (alt_stmt_code != ERROR_MARK
944 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
945 {
946 unsigned char *sel
947 = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
948 for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
949 {
950 sel[i] = i;
951 if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
952 sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
953 }
954 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
955 {
956 for (i = 0; i < group_size; ++i)
957 if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
958 {
959 matches[i] = false;
960 if (dump_enabled_p ())
961 {
962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
963 "Build SLP failed: different operation "
964 "in stmt ");
965 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
966 stmts[i], 0);
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "original stmt ");
969 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
970 first_stmt, 0);
971 }
972 }
973 return false;
974 }
975 *two_operators = true;
976 }
977
978 return true;
979 }
980
  981 /* Recursively build an SLP tree starting from NODE.
  982    Fail (and return FALSE) if def-stmts are not isomorphic, require data
  983    permutation or are of unsupported types of operation.  Otherwise,
  984    return TRUE.  *MATCHES indicates which scalar stmts matched the first
  985    one; if MATCHES[0] is false the mismatch is fatal and the node will
  986    never be vectorized by SLP.  */
987
988 static bool
989 vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
990 slp_tree *node, unsigned int group_size,
991 unsigned int *max_nunits,
992 vec<slp_tree> *loads,
993 unsigned int vectorization_factor,
994 bool *matches, unsigned *npermutes, unsigned *tree_size,
995 unsigned max_tree_size)
996 {
997 unsigned nops, i, this_tree_size = 0;
998 gimple stmt;
999
1000 matches[0] = false;
1001
1002 stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
1003 if (is_gimple_call (stmt))
1004 nops = gimple_call_num_args (stmt);
1005 else if (is_gimple_assign (stmt))
1006 {
1007 nops = gimple_num_ops (stmt) - 1;
1008 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1009 nops++;
1010 }
1011 else
1012 return false;
1013
1014 bool two_operators = false;
1015 if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
1016 SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
1017 max_nunits, vectorization_factor, matches,
1018 &two_operators))
1019 return false;
1020 SLP_TREE_TWO_OPERATORS (*node) = two_operators;
1021
1022 /* If the SLP node is a load, terminate the recursion. */
1023 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
1024 && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1025 {
1026 loads->safe_push (*node);
1027 return true;
1028 }
1029
1030 /* Get at the operands, verifying they are compatible. */
1031 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1032 slp_oprnd_info oprnd_info;
1033 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
1034 {
1035 switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
1036 stmt, (i == 0), &oprnds_info))
1037 {
1038 case 0:
1039 break;
1040 case -1:
1041 matches[0] = false;
1042 vect_free_oprnd_info (oprnds_info);
1043 return false;
1044 case 1:
1045 matches[i] = false;
1046 break;
1047 }
1048 }
1049 for (i = 0; i < group_size; ++i)
1050 if (!matches[i])
1051 {
1052 vect_free_oprnd_info (oprnds_info);
1053 return false;
1054 }
1055
1056 stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
1057
1058 /* Create SLP_TREE nodes for the definition node/s. */
1059 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1060 {
1061 slp_tree child;
1062 unsigned old_nloads = loads->length ();
1063 unsigned old_max_nunits = *max_nunits;
1064
1065 if (oprnd_info->first_dt != vect_internal_def)
1066 continue;
1067
1068 if (++this_tree_size > max_tree_size)
1069 {
1070 vect_free_oprnd_info (oprnds_info);
1071 return false;
1072 }
1073
1074 child = vect_create_new_slp_node (oprnd_info->def_stmts);
1075 if (!child)
1076 {
1077 vect_free_oprnd_info (oprnds_info);
1078 return false;
1079 }
1080
1081 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1082 group_size, max_nunits, loads,
1083 vectorization_factor, matches,
1084 npermutes, &this_tree_size, max_tree_size))
1085 {
1086 oprnd_info->def_stmts = vNULL;
1087 SLP_TREE_CHILDREN (*node).quick_push (child);
1088 continue;
1089 }
1090
1091 /* If the SLP build failed fatally and we analyze a basic-block
1092 simply treat nodes we fail to build as externally defined
1093 (and thus build vectors from the scalar defs).
1094 The cost model will reject outright expensive cases.
 1095 	 ??? This doesn't treat cases where permutation ultimately
1096 fails (or we don't try permutation below). Ideally we'd
1097 even compute a permutation that will end up with the maximum
1098 SLP tree size... */
1099 if (bb_vinfo
1100 && !matches[0]
1101 /* ??? Rejecting patterns this way doesn't work. We'd have to
1102 do extra work to cancel the pattern so the uses see the
1103 scalar version. */
1104 && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
1105 {
1106 dump_printf_loc (MSG_NOTE, vect_location,
1107 "Building vector operands from scalars\n");
1108 oprnd_info->def_stmts = vNULL;
1109 vect_free_slp_tree (child);
1110 SLP_TREE_CHILDREN (*node).quick_push (NULL);
1111 continue;
1112 }
1113
 1114 	  /* If the SLP build for operand zero failed and operands zero
 1115 	     and one can be commuted, try that for the scalar stmts
 1116 	     that failed the match.  */
1117 if (i == 0
1118 /* A first scalar stmt mismatch signals a fatal mismatch. */
1119 && matches[0]
1120 /* ??? For COND_EXPRs we can swap the comparison operands
1121 as well as the arms under some constraints. */
1122 && nops == 2
1123 && oprnds_info[1]->first_dt == vect_internal_def
1124 && is_gimple_assign (stmt)
1125 && commutative_tree_code (gimple_assign_rhs_code (stmt))
 1126 	      /* Do so only if the number of unsuccessful permutes was not more
 1127 	         than a cut-off, as re-trying the recursive match on
 1128 	         possibly each level of the tree would expose exponential
 1129 	         behavior.  */
1130 && *npermutes < 4)
1131 {
1132 unsigned int j;
1133 slp_tree grandchild;
1134
1135 /* Roll back. */
1136 *max_nunits = old_max_nunits;
1137 loads->truncate (old_nloads);
1138 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1139 vect_free_slp_tree (grandchild);
1140 SLP_TREE_CHILDREN (child).truncate (0);
1141
1142 /* Swap mismatched definition stmts. */
1143 dump_printf_loc (MSG_NOTE, vect_location,
1144 "Re-trying with swapped operands of stmts ");
1145 for (j = 0; j < group_size; ++j)
1146 if (!matches[j])
1147 {
1148 gimple tem = oprnds_info[0]->def_stmts[j];
1149 oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
1150 oprnds_info[1]->def_stmts[j] = tem;
1151 dump_printf (MSG_NOTE, "%d ", j);
1152 }
1153 dump_printf (MSG_NOTE, "\n");
1154 /* And try again with scratch 'matches' ... */
1155 bool *tem = XALLOCAVEC (bool, group_size);
1156 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1157 group_size, max_nunits, loads,
1158 vectorization_factor,
1159 tem, npermutes, &this_tree_size,
1160 max_tree_size))
1161 {
1162 /* ... so if successful we can apply the operand swapping
1163 to the GIMPLE IL. This is necessary because for example
1164 vect_get_slp_defs uses operand indexes and thus expects
1165 canonical operand order. */
1166 for (j = 0; j < group_size; ++j)
1167 if (!matches[j])
1168 {
1169 gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
1170 swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
1171 gimple_assign_rhs2_ptr (stmt));
1172 }
1173 oprnd_info->def_stmts = vNULL;
1174 SLP_TREE_CHILDREN (*node).quick_push (child);
1175 continue;
1176 }
1177
1178 ++*npermutes;
1179 }
1180
1181 oprnd_info->def_stmts = vNULL;
1182 vect_free_slp_tree (child);
1183 vect_free_oprnd_info (oprnds_info);
1184 return false;
1185 }
1186
1187 if (tree_size)
1188 *tree_size += this_tree_size;
1189
1190 vect_free_oprnd_info (oprnds_info);
1191 return true;
1192 }
1193
1194 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1195
1196 static void
1197 vect_print_slp_tree (int dump_kind, slp_tree node)
1198 {
1199 int i;
1200 gimple stmt;
1201 slp_tree child;
1202
1203 if (!node)
1204 return;
1205
1206 dump_printf (dump_kind, "node ");
1207 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1208 {
1209 dump_printf (dump_kind, "\n\tstmt %d ", i);
1210 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1211 }
1212 dump_printf (dump_kind, "\n");
1213
1214 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1215 vect_print_slp_tree (dump_kind, child);
1216 }
1217
1218
1219 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1220 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1221 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1222 stmts in NODE are to be marked. */
1223
1224 static void
1225 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1226 {
1227 int i;
1228 gimple stmt;
1229 slp_tree child;
1230
1231 if (!node)
1232 return;
1233
1234 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1235 if (j < 0 || i == j)
1236 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1237
1238 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1239 vect_mark_slp_stmts (child, mark, j);
1240 }
1241
1242
1243 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1244
1245 static void
1246 vect_mark_slp_stmts_relevant (slp_tree node)
1247 {
1248 int i;
1249 gimple stmt;
1250 stmt_vec_info stmt_info;
1251 slp_tree child;
1252
1253 if (!node)
1254 return;
1255
1256 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1257 {
1258 stmt_info = vinfo_for_stmt (stmt);
1259 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1260 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1261 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1262 }
1263
1264 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1265 vect_mark_slp_stmts_relevant (child);
1266 }
1267
1268
1269 /* Rearrange the statements of NODE according to PERMUTATION. */
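/* PERMUTATION maps each current position I to the slot the stmt moves to;
   e.g. { 2, 0, 1 } moves stmt 0 to slot 2, stmt 1 to slot 0 and stmt 2 to
   slot 1, in every node of the subtree.  */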
1270
1271 static void
1272 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1273 vec<unsigned> permutation)
1274 {
1275 gimple stmt;
1276 vec<gimple> tmp_stmts;
1277 unsigned int i;
1278 slp_tree child;
1279
1280 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1281 vect_slp_rearrange_stmts (child, group_size, permutation);
1282
1283 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1284 tmp_stmts.create (group_size);
1285 tmp_stmts.quick_grow_cleared (group_size);
1286
1287 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1288 tmp_stmts[permutation[i]] = stmt;
1289
1290 SLP_TREE_SCALAR_STMTS (node).release ();
1291 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1292 }
1293
1294
1295 /* Check if the required load permutations in the SLP instance
1296 SLP_INSTN are supported. */
1297
1298 static bool
1299 vect_supported_load_permutation_p (slp_instance slp_instn)
1300 {
1301 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1302 unsigned int i, j, k, next;
1303 sbitmap load_index;
1304 slp_tree node;
1305 gimple stmt, load, next_load, first_load;
1306 struct data_reference *dr;
1307
1308 if (dump_enabled_p ())
1309 {
1310 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1311 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1312 if (node->load_permutation.exists ())
1313 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1314 dump_printf (MSG_NOTE, "%d ", next);
1315 else
1316 for (k = 0; k < group_size; ++k)
1317 dump_printf (MSG_NOTE, "%d ", k);
1318 dump_printf (MSG_NOTE, "\n");
1319 }
1320
1321 /* In case of reduction every load permutation is allowed, since the order
1322 of the reduction statements is not important (as opposed to the case of
1323 grouped stores). The only condition we need to check is that all the
1324 load nodes are of the same size and have the same permutation (and then
1325 rearrange all the nodes of the SLP instance according to this
1326 permutation). */
1327
1328 /* Check that all the load nodes are of the same size. */
1329 /* ??? Can't we assert this? */
1330 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1331 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1332 return false;
1333
1334 node = SLP_INSTANCE_TREE (slp_instn);
1335 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1336
1337 /* Reduction (there are no data-refs in the root).
 1338      In a reduction chain the order of the loads is important.  */
1339 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1340 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1341 {
1342 slp_tree load;
1343 unsigned int lidx;
1344
1345 /* Compare all the permutation sequences to the first one. We know
1346 that at least one load is permuted. */
1347 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1348 if (!node->load_permutation.exists ())
1349 return false;
1350 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1351 {
1352 if (!load->load_permutation.exists ())
1353 return false;
1354 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1355 if (lidx != node->load_permutation[j])
1356 return false;
1357 }
1358
1359 /* Check that the loads in the first sequence are different and there
1360 are no gaps between them. */
1361 load_index = sbitmap_alloc (group_size);
1362 bitmap_clear (load_index);
1363 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1364 {
1365 if (bitmap_bit_p (load_index, lidx))
1366 {
1367 sbitmap_free (load_index);
1368 return false;
1369 }
1370 bitmap_set_bit (load_index, lidx);
1371 }
1372 for (i = 0; i < group_size; i++)
1373 if (!bitmap_bit_p (load_index, i))
1374 {
1375 sbitmap_free (load_index);
1376 return false;
1377 }
1378 sbitmap_free (load_index);
1379
1380 /* This permutation is valid for reduction. Since the order of the
1381 statements in the nodes is not important unless they are memory
1382 accesses, we can rearrange the statements in all the nodes
1383 according to the order of the loads. */
1384 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1385 node->load_permutation);
1386
1387 /* We are done, no actual permutations need to be generated. */
1388 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1389 SLP_TREE_LOAD_PERMUTATION (node).release ();
1390 return true;
1391 }
1392
1393 /* In basic block vectorization we allow any subchain of an interleaving
1394 chain.
 1395    FORNOW: not supported in loop SLP because of realignment complications.  */
1396 if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1397 {
1398 /* Check whether the loads in an instance form a subchain and thus
1399 no permutation is necessary. */
1400 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1401 {
1402 bool subchain_p = true;
1403 next_load = NULL;
1404 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1405 {
1406 if (j != 0 && next_load != load)
1407 {
1408 subchain_p = false;
1409 break;
1410 }
1411 next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1412 }
1413 if (subchain_p)
1414 SLP_TREE_LOAD_PERMUTATION (node).release ();
1415 else
1416 {
1417 /* Verify the permutation can be generated. */
1418 vec<tree> tem;
1419 if (!vect_transform_slp_perm_load (node, tem, NULL,
1420 1, slp_instn, true))
1421 {
1422 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1423 vect_location,
1424 "unsupported load permutation\n");
1425 return false;
1426 }
1427 }
1428 }
1429
1430 /* Check that the alignment of the first load in every subchain, i.e.,
1431 the first statement in every load node, is supported.
1432 ??? This belongs in alignment checking. */
1433 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1434 {
1435 first_load = SLP_TREE_SCALAR_STMTS (node)[0];
1436 if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
1437 {
1438 dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
1439 if (vect_supportable_dr_alignment (dr, false)
1440 == dr_unaligned_unsupported)
1441 {
1442 if (dump_enabled_p ())
1443 {
1444 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1445 vect_location,
1446 "unsupported unaligned load ");
1447 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1448 first_load, 0);
1449 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1450 }
1451 return false;
1452 }
1453 }
1454 }
1455
1456 return true;
1457 }
1458
 1459   /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
 1460      GROUP_SIZE, where each sequence of accesses to the same dr is also of
 1461      GROUP_SIZE length (unless this is a reduction).  */
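   /* That is, every load node reads a single element of the interleaving
      chain in all of its lanes; e.g. with GROUP_SIZE 2, two load nodes whose
      permutations are { 0, 0 } and { 1, 1 }.  */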
1462 if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
1463 return false;
1464 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1465 if (!node->load_permutation.exists ())
1466 return false;
1467
1468 load_index = sbitmap_alloc (group_size);
1469 bitmap_clear (load_index);
1470 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1471 {
1472 unsigned int lidx = node->load_permutation[0];
1473 if (bitmap_bit_p (load_index, lidx))
1474 {
1475 sbitmap_free (load_index);
1476 return false;
1477 }
1478 bitmap_set_bit (load_index, lidx);
1479 FOR_EACH_VEC_ELT (node->load_permutation, j, k)
1480 if (k != lidx)
1481 {
1482 sbitmap_free (load_index);
1483 return false;
1484 }
1485 }
1486 for (i = 0; i < group_size; i++)
1487 if (!bitmap_bit_p (load_index, i))
1488 {
1489 sbitmap_free (load_index);
1490 return false;
1491 }
1492 sbitmap_free (load_index);
1493
1494 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1495 if (node->load_permutation.exists ()
1496 && !vect_transform_slp_perm_load
1497 (node, vNULL, NULL,
1498 SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
1499 return false;
1500 return true;
1501 }
1502
1503
 1504 /* Find the last scalar stmt among the scalar stmts in SLP node NODE.  */
1505
1506 static gimple
1507 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1508 {
1509 gimple last = NULL, stmt;
1510
1511 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
1512 {
1513 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
1514 if (is_pattern_stmt_p (stmt_vinfo))
1515 last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
1516 else
1517 last = get_later_stmt (stmt, last);
1518 }
1519
1520 return last;
1521 }
1522
1523 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1524
1525 static void
1526 vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1527 slp_instance instance, slp_tree node,
1528 stmt_vector_for_cost *prologue_cost_vec,
1529 unsigned ncopies_for_cost)
1530 {
1531 stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);
1532
1533 unsigned i;
1534 slp_tree child;
1535 gimple stmt, s;
1536 stmt_vec_info stmt_info;
1537 tree lhs;
1538 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1539
1540 /* Recurse down the SLP tree. */
1541 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1542 if (child)
1543 vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1544 instance, child, prologue_cost_vec,
1545 ncopies_for_cost);
1546
1547 /* Look at the first scalar stmt to determine the cost. */
1548 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1549 stmt_info = vinfo_for_stmt (stmt);
1550 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1551 {
1552 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1553 vect_model_store_cost (stmt_info, ncopies_for_cost, false,
1554 vect_uninitialized_def,
1555 node, prologue_cost_vec, body_cost_vec);
1556 else
1557 {
1558 int i;
1559 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1560 vect_model_load_cost (stmt_info, ncopies_for_cost, false,
1561 node, prologue_cost_vec, body_cost_vec);
1562 /* If the load is permuted record the cost for the permutation.
1563 ??? Loads from multiple chains are let through here only
1564 for a single special case involving complex numbers where
1565 in the end no permutation is necessary. */
1566 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
1567 if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
1568 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
1569 && vect_get_place_in_interleaving_chain
1570 (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
1571 {
1572 record_stmt_cost (body_cost_vec, group_size, vec_perm,
1573 stmt_info, 0, vect_body);
1574 break;
1575 }
1576 }
1577 }
1578 else
1579 {
1580 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1581 stmt_info, 0, vect_body);
1582 if (SLP_TREE_TWO_OPERATORS (node))
1583 {
1584 record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1585 stmt_info, 0, vect_body);
1586 record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
1587 stmt_info, 0, vect_body);
1588 }
1589 }
1590
1591 /* Scan operands and account for prologue cost of constants/externals.
1592 ??? This over-estimates cost for multiple uses and should be
1593 re-engineered. */
1594 lhs = gimple_get_lhs (stmt);
1595 for (i = 0; i < gimple_num_ops (stmt); ++i)
1596 {
1597 tree def, op = gimple_op (stmt, i);
1598 gimple def_stmt;
1599 enum vect_def_type dt;
1600 if (!op || op == lhs)
1601 continue;
1602 if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
1603 &def_stmt, &def, &dt))
1604 {
1605 /* Without looking at the actual initializer a vector of
1606 constants can be implemented as load from the constant pool.
1607 ??? We need to pass down stmt_info for a vector type
1608 even if it points to the wrong stmt. */
1609 if (dt == vect_constant_def)
1610 record_stmt_cost (prologue_cost_vec, 1, vector_load,
1611 stmt_info, 0, vect_prologue);
1612 else if (dt == vect_external_def)
1613 record_stmt_cost (prologue_cost_vec, 1, vec_construct,
1614 stmt_info, 0, vect_prologue);
1615 }
1616 }
1617 }
1618
1619 /* Compute the cost for the SLP instance INSTANCE. */
1620
1621 static void
1622 vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1623 slp_instance instance, unsigned nunits)
1624 {
1625 stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1626 unsigned ncopies_for_cost;
1627 stmt_info_for_cost *si;
1628 unsigned i;
1629
 1630   /* Calculate the number of vector stmts to create based on the unrolling
 1631      factor (the number of vectors is 1 if NUNITS >= GROUP_SIZE, and
 1632      GROUP_SIZE / NUNITS otherwise).  */
1633 unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1634 ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
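  /* E.g. a group of 8 stmts with 4-lane vectors needs 2 copies of each
     vector stmt: least_common_multiple (4, 8) / 4 == 2.  */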
1635
1636 prologue_cost_vec.create (10);
1637 body_cost_vec.create (10);
1638 SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1639 vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1640 instance, SLP_INSTANCE_TREE (instance),
1641 &prologue_cost_vec, ncopies_for_cost);
1642
1643 /* Record the prologue costs, which were delayed until we were
1644 sure that SLP was successful. Unlike the body costs, we know
1645 the final values now regardless of the loop vectorization factor. */
1646 void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1647 : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1648 FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1649 {
1650 struct _stmt_vec_info *stmt_info
1651 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1652 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1653 si->misalign, vect_prologue);
1654 }
1655
1656 prologue_cost_vec.release ();
1657 }
1658
1659 /* Analyze an SLP instance starting from a group of grouped stores. Call
1660 vect_build_slp_tree to build a tree of packed stmts if possible.
1661 Return FALSE if it's impossible to SLP any stmt in the loop. */
1662
1663 static bool
1664 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1665 gimple stmt, unsigned max_tree_size)
1666 {
1667 slp_instance new_instance;
1668 slp_tree node;
1669 unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1670 unsigned int unrolling_factor = 1, nunits;
1671 tree vectype, scalar_type = NULL_TREE;
1672 gimple next;
1673 unsigned int vectorization_factor = 0;
1674 int i;
1675 unsigned int max_nunits = 0;
1676 vec<slp_tree> loads;
1677 struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1678 vec<gimple> scalar_stmts;
1679
1680 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1681 {
1682 if (dr)
1683 {
1684 scalar_type = TREE_TYPE (DR_REF (dr));
1685 vectype = get_vectype_for_scalar_type (scalar_type);
1686 }
1687 else
1688 {
1689 gcc_assert (loop_vinfo);
1690 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1691 }
1692
1693 group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1694 }
1695 else
1696 {
1697 gcc_assert (loop_vinfo);
1698 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1699 group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1700 }
1701
1702 if (!vectype)
1703 {
1704 if (dump_enabled_p ())
1705 {
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "Build SLP failed: unsupported data-type ");
1708 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1710 }
1711
1712 return false;
1713 }
1714
1715 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1716 if (loop_vinfo)
1717 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1718 else
1719 vectorization_factor = nunits;
1720
1721 /* Calculate the unrolling factor. */
1722 unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
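  /* E.g. a store group of 2 stmts with 4-lane vectors must be unrolled
     twice to fill a vector: least_common_multiple (4, 2) / 2 == 2.  */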
1723 if (unrolling_factor != 1 && !loop_vinfo)
1724 {
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727 "Build SLP failed: unrolling required in basic"
1728 " block SLP\n");
1729
1730 return false;
1731 }
1732
1733 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1734 scalar_stmts.create (group_size);
1735 next = stmt;
1736 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1737 {
1738 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1739 while (next)
1740 {
1741 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1742 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1743 scalar_stmts.safe_push (
1744 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1745 else
1746 scalar_stmts.safe_push (next);
1747 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1748 }
1749 }
1750 else
1751 {
1752 /* Collect reduction statements. */
1753 vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1754 for (i = 0; reductions.iterate (i, &next); i++)
1755 scalar_stmts.safe_push (next);
1756 }
1757
1758 node = vect_create_new_slp_node (scalar_stmts);
1759
1760 loads.create (group_size);
1761
1762 /* Build the tree for the SLP instance. */
1763 bool *matches = XALLOCAVEC (bool, group_size);
1764 unsigned npermutes = 0;
1765 if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1766 &max_nunits, &loads,
1767 vectorization_factor, matches, &npermutes, NULL,
1768 max_tree_size))
1769 {
1770 /* Calculate the unrolling factor based on the smallest type. */
1771 if (max_nunits > nunits)
1772 unrolling_factor = least_common_multiple (max_nunits, group_size)
1773 / group_size;
1774
1775 if (unrolling_factor != 1 && !loop_vinfo)
1776 {
1777 if (dump_enabled_p ())
1778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1779 "Build SLP failed: unrolling required in basic"
1780 " block SLP\n");
1781 vect_free_slp_tree (node);
1782 loads.release ();
1783 return false;
1784 }
1785
1786 /* Create a new SLP instance. */
1787 new_instance = XNEW (struct _slp_instance);
1788 SLP_INSTANCE_TREE (new_instance) = node;
1789 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1790 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1791 SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1792 SLP_INSTANCE_LOADS (new_instance) = loads;
1793
1794 /* Compute the load permutation. */
1795 slp_tree load_node;
1796 bool loads_permuted = false;
1797 FOR_EACH_VEC_ELT (loads, i, load_node)
1798 {
1799 vec<unsigned> load_permutation;
1800 int j;
1801 gimple load, first_stmt;
1802 bool this_load_permuted = false;
1803 load_permutation.create (group_size);
1804 first_stmt = GROUP_FIRST_ELEMENT
1805 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1806 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1807 {
1808 int load_place
1809 = vect_get_place_in_interleaving_chain (load, first_stmt);
1810 gcc_assert (load_place != -1);
1811 if (load_place != j)
1812 this_load_permuted = true;
1813 load_permutation.safe_push (load_place);
1814 }
1815 if (!this_load_permuted)
1816 {
1817 load_permutation.release ();
1818 continue;
1819 }
1820 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1821 loads_permuted = true;
1822 }
1823
1824 if (loads_permuted)
1825 {
1826 if (!vect_supported_load_permutation_p (new_instance))
1827 {
1828 if (dump_enabled_p ())
1829 {
1830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1831 "Build SLP failed: unsupported load "
1832 "permutation ");
1833 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1834 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1835 }
1836 vect_free_slp_instance (new_instance);
1837 return false;
1838 }
1839 }
1840
1841
1842 if (loop_vinfo)
1843 {
1844 /* Compute the costs of this SLP instance. Delay this for BB
1845 vectorization as we don't have vector types computed yet. */
1846 vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1847 new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1848 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1849 }
1850 else
1851 BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1852
1853 if (dump_enabled_p ())
1854 vect_print_slp_tree (MSG_NOTE, node);
1855
1856 return true;
1857 }
1858
1859 /* Failed to SLP. */
1860 /* Free the allocated memory. */
1861 vect_free_slp_tree (node);
1862 loads.release ();
1863
1864 return false;
1865 }
1866
1867
 1868 /* Check if there are stmts in the loop that can be vectorized using SLP.
 1869    Build SLP trees of packed scalar stmts if SLP is possible.  */
1870
1871 bool
1872 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1873 unsigned max_tree_size)
1874 {
1875 unsigned int i;
1876 vec<gimple> grouped_stores;
1877 vec<gimple> reductions = vNULL;
1878 vec<gimple> reduc_chains = vNULL;
1879 gimple first_element;
1880 bool ok = false;
1881
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1884
1885 if (loop_vinfo)
1886 {
1887 grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1888 reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1889 reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1890 }
1891 else
1892 grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1893
1894 /* Find SLP sequences starting from groups of grouped stores. */
1895 FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1896 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1897 max_tree_size))
1898 ok = true;
1899
1900 if (bb_vinfo && !ok)
1901 {
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "Failed to SLP the basic block.\n");
1905
1906 return false;
1907 }
1908
1909 if (loop_vinfo
1910 && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1911 {
1912 /* Find SLP sequences starting from reduction chains. */
1913 FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1914 if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1915 max_tree_size))
1916 ok = true;
1917 else
1918 return false;
1919
1920 /* Don't try to vectorize SLP reductions if reduction chain was
1921 detected. */
1922 return ok;
1923 }
1924
1925 /* Find SLP sequences starting from groups of reductions. */
1926 if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1927 && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1928 max_tree_size))
1929 ok = true;
1930
1931 return true;
1932 }
1933
1934
1935 /* For each possible SLP instance decide whether to SLP it and calculate the overall
1936 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1937 least one instance. */
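/* Illustrative note (not part of the original source): the loop SLP
   unrolling factor computed below is the maximum over all instances,
   e.g. instances with unrolling factors 1 and 2 give a loop factor
   of 2.  */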
1938
1939 bool
1940 vect_make_slp_decision (loop_vec_info loop_vinfo)
1941 {
1942 unsigned int i, unrolling_factor = 1;
1943 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1944 slp_instance instance;
1945 int decided_to_slp = 0;
1946
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1949 "\n");
1950
1951 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1952 {
1953 /* FORNOW: SLP if you can. */
1954 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1955 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1956
1957 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1958 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1959 loop-based vectorization. Such stmts will be marked as HYBRID. */
1960 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1961 decided_to_slp++;
1962 }
1963
1964 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1965
1966 if (decided_to_slp && dump_enabled_p ())
1967 dump_printf_loc (MSG_NOTE, vect_location,
1968 "Decided to SLP %d instances. Unrolling factor %d\n",
1969 decided_to_slp, unrolling_factor);
1970
1971 return (decided_to_slp > 0);
1972 }
1973
1974
1975 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1976 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
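/* Illustrative note (not part of the original source): a stmt that is
   PURE_SLP but whose SSA def is also consumed by a relevant stmt inside
   the loop that is not part of any SLP instance is switched to HYBRID
   below, and the HYBRID marking is then propagated to the children of
   the SLP node.  */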
1977
1978 static void
1979 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1980 {
1981 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1982 imm_use_iterator imm_iter;
1983 gimple use_stmt;
1984 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1985 slp_tree child;
1986 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1987 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1988 int j;
1989
1990 /* Propagate hybrid down the SLP tree. */
1991 if (stype == hybrid)
1992 ;
1993 else if (HYBRID_SLP_STMT (stmt_vinfo))
1994 stype = hybrid;
1995 else
1996 {
1997 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
1998 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1999 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
2000 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
2001 if (gimple_bb (use_stmt)
2002 && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2003 && (use_vinfo = vinfo_for_stmt (use_stmt))
2004 && !STMT_SLP_TYPE (use_vinfo)
2005 && (STMT_VINFO_RELEVANT (use_vinfo)
2006 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
2007 || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
2008 && STMT_VINFO_RELATED_STMT (use_vinfo)
2009 && !STMT_SLP_TYPE (vinfo_for_stmt
2010 (STMT_VINFO_RELATED_STMT (use_vinfo)))))
2011 && !(gimple_code (use_stmt) == GIMPLE_PHI
2012 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2013 stype = hybrid;
2014 }
2015
2016 if (stype == hybrid)
2017 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2018
2019 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2020 if (child)
2021 vect_detect_hybrid_slp_stmts (child, i, stype);
2022 }
2023
2024 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2025
2026 static tree
2027 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2028 {
2029 walk_stmt_info *wi = (walk_stmt_info *)data;
2030 struct loop *loopp = (struct loop *)wi->info;
2031
2032 if (wi->is_lhs)
2033 return NULL_TREE;
2034
2035 if (TREE_CODE (*tp) == SSA_NAME
2036 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2037 {
2038 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
2039 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2040 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2041 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2042 }
2043
2044 return NULL_TREE;
2045 }
2046
2047 static tree
2048 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2049 walk_stmt_info *)
2050 {
2051 /* If the stmt is in an SLP instance then this isn't a reason
2052 to mark use definitions in other SLP instances as hybrid. */
2053 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2054 *handled = true;
2055 return NULL_TREE;
2056 }
2057
2058 /* Find stmts that must be both vectorized and SLPed. */
2059
2060 void
2061 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2062 {
2063 unsigned int i;
2064 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2065 slp_instance instance;
2066
2067 if (dump_enabled_p ())
2068 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2069 "\n");
2070
2071 /* First walk all pattern stmts in the loop and mark the defs of their uses as
2072 hybrid, because immediate uses inside pattern stmts are not recorded. */
2073 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2074 {
2075 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2076 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2077 gsi_next (&gsi))
2078 {
2079 gimple stmt = gsi_stmt (gsi);
2080 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2081 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2082 {
2083 walk_stmt_info wi;
2084 memset (&wi, 0, sizeof (wi));
2085 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2086 gimple_stmt_iterator gsi2
2087 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2088 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2089 vect_detect_hybrid_slp_1, &wi);
2090 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2091 vect_detect_hybrid_slp_2,
2092 vect_detect_hybrid_slp_1, &wi);
2093 }
2094 }
2095 }
2096
2097 /* Then walk the SLP instance trees marking stmts with uses in
2098 non-SLP stmts as hybrid, also propagating hybrid down the
2099 SLP tree, collecting the above info on-the-fly. */
2100 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2101 {
2102 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2103 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2104 i, pure_slp);
2105 }
2106 }
2107
2108
2109 /* Create and initialize a new bb_vec_info struct for BB, as well as
2110 stmt_vec_info structs for all the stmts in it. */
2111
2112 static bb_vec_info
2113 new_bb_vec_info (basic_block bb)
2114 {
2115 bb_vec_info res = NULL;
2116 gimple_stmt_iterator gsi;
2117
2118 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2119 BB_VINFO_BB (res) = bb;
2120
2121 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2122 {
2123 gimple stmt = gsi_stmt (gsi);
2124 gimple_set_uid (stmt, 0);
2125 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2126 }
2127
2128 BB_VINFO_GROUPED_STORES (res).create (10);
2129 BB_VINFO_SLP_INSTANCES (res).create (2);
2130 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2131
2132 bb->aux = res;
2133 return res;
2134 }
2135
2136
2137 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2138 stmts in the basic block. */
2139
2140 static void
2141 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2142 {
2143 vec<slp_instance> slp_instances;
2144 slp_instance instance;
2145 basic_block bb;
2146 gimple_stmt_iterator si;
2147 unsigned i;
2148
2149 if (!bb_vinfo)
2150 return;
2151
2152 bb = BB_VINFO_BB (bb_vinfo);
2153
2154 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2155 {
2156 gimple stmt = gsi_stmt (si);
2157 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2158
2159 if (stmt_info)
2160 /* Free stmt_vec_info. */
2161 free_stmt_vec_info (stmt);
2162 }
2163
2164 vect_destroy_datarefs (NULL, bb_vinfo);
2165 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2166 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2167 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2168 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2169 vect_free_slp_instance (instance);
2170 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2171 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2172 free (bb_vinfo);
2173 bb->aux = NULL;
2174 }
2175
2176
2177 /* Analyze statements contained in SLP tree node after recursively analyzing
2178 the subtree. Return TRUE if the operations are supported. */
2179
2180 static bool
2181 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2182 {
2183 bool dummy;
2184 int i;
2185 gimple stmt;
2186 slp_tree child;
2187
2188 if (!node)
2189 return true;
2190
2191 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2192 if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2193 return false;
2194
2195 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2196 {
2197 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2198 gcc_assert (stmt_info);
2199 gcc_assert (PURE_SLP_STMT (stmt_info));
2200
2201 if (!vect_analyze_stmt (stmt, &dummy, node))
2202 return false;
2203 }
2204
2205 return true;
2206 }
2207
2208
2209 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2210 operations are supported. */
2211
2212 static bool
2213 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2214 {
2215 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2216 slp_instance instance;
2217 int i;
2218
2219 for (i = 0; slp_instances.iterate (i, &instance); )
2220 {
2221 if (!vect_slp_analyze_node_operations (bb_vinfo,
2222 SLP_INSTANCE_TREE (instance)))
2223 {
2224 vect_free_slp_instance (instance);
2225 slp_instances.ordered_remove (i);
2226 }
2227 else
2228 i++;
2229 }
2230
2231 if (!slp_instances.length ())
2232 return false;
2233
2234 return true;
2235 }
2236
2237
2238 /* Compute the scalar cost of the SLP node NODE and its children
2239 and return it. Do not account defs that are marked in LIFE and
2240 update LIFE according to uses of NODE. */
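/* Illustrative note (not part of the original source): LIFE[i] is set
   when the i-th scalar stmt has a use that will not be vectorized
   (a PHI, a use outside the basic block, or a non-vectorizable stmt);
   such stmts stay live in the scalar code, so their cost is not
   credited to the scalar side of the comparison.  */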
2241
2242 static unsigned
2243 vect_bb_slp_scalar_cost (basic_block bb,
2244 slp_tree node, vec<bool, va_heap> *life)
2245 {
2246 unsigned scalar_cost = 0;
2247 unsigned i;
2248 gimple stmt;
2249 slp_tree child;
2250
2251 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2252 {
2253 unsigned stmt_cost;
2254 ssa_op_iter op_iter;
2255 def_operand_p def_p;
2256 stmt_vec_info stmt_info;
2257
2258 if ((*life)[i])
2259 continue;
2260
2261 /* If there is a non-vectorized use of the defs then the scalar
2262 stmt is kept live, in which case we do not account it or any
2263 required defs in the SLP children in the scalar cost. This
2264 way we make the vectorization more costly when compared to
2265 the scalar cost. */
2266 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2267 {
2268 imm_use_iterator use_iter;
2269 gimple use_stmt;
2270 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2271 if (!is_gimple_debug (use_stmt)
2272 && (gimple_code (use_stmt) == GIMPLE_PHI
2273 || gimple_bb (use_stmt) != bb
2274 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2275 {
2276 (*life)[i] = true;
2277 BREAK_FROM_IMM_USE_STMT (use_iter);
2278 }
2279 }
2280 if ((*life)[i])
2281 continue;
2282
2283 stmt_info = vinfo_for_stmt (stmt);
2284 if (STMT_VINFO_DATA_REF (stmt_info))
2285 {
2286 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2287 stmt_cost = vect_get_stmt_cost (scalar_load);
2288 else
2289 stmt_cost = vect_get_stmt_cost (scalar_store);
2290 }
2291 else
2292 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2293
2294 scalar_cost += stmt_cost;
2295 }
2296
2297 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2298 if (child)
2299 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2300
2301 return scalar_cost;
2302 }
2303
2304 /* Check if vectorization of the basic block is profitable. */
2305
2306 static bool
2307 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2308 {
2309 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2310 slp_instance instance;
2311 int i, j;
2312 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2313 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2314 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2315 stmt_vec_info stmt_info = NULL;
2316 stmt_vector_for_cost body_cost_vec;
2317 stmt_info_for_cost *ci;
2318
2319 /* Calculate vector costs. */
2320 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2321 {
2322 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2323
2324 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2325 {
2326 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2327 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2328 stmt_info, ci->misalign, vect_body);
2329 }
2330 }
2331
2332 /* Calculate scalar cost. */
2333 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2334 {
2335 auto_vec<bool, 20> life;
2336 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2337 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2338 SLP_INSTANCE_TREE (instance),
2339 &life);
2340 }
2341
2342 /* Complete the target-specific cost calculation. */
2343 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2344 &vec_inside_cost, &vec_epilogue_cost);
2345
2346 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2347
2348 if (dump_enabled_p ())
2349 {
2350 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2351 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2352 vec_inside_cost);
2353 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2354 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2355 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2356 }
2357
2358 /* Vectorization is profitable if its cost is less than the cost of scalar
2359 version. */
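/* Illustrative note (not part of the original source): with, say,
   vec_prologue_cost = 2, vec_epilogue_cost = 0, vec_inside_cost = 10
   and scalar_cost = 16, the test below compares 12 against 16 and
   declares the basic block profitable to vectorize.  */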
2360 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2361 return false;
2362
2363 return true;
2364 }
2365
2366 /* Check if the basic block can be vectorized. */
2367
2368 static bb_vec_info
2369 vect_slp_analyze_bb_1 (basic_block bb)
2370 {
2371 bb_vec_info bb_vinfo;
2372 vec<slp_instance> slp_instances;
2373 slp_instance instance;
2374 int i;
2375 int min_vf = 2;
2376 unsigned n_stmts = 0;
2377
2378 bb_vinfo = new_bb_vec_info (bb);
2379 if (!bb_vinfo)
2380 return NULL;
2381
2382 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2383 {
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2386 "not vectorized: unhandled data-ref in basic "
2387 "block.\n");
2388
2389 destroy_bb_vec_info (bb_vinfo);
2390 return NULL;
2391 }
2392
2393 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2394 {
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397 "not vectorized: not enough data-refs in "
2398 "basic block.\n");
2399
2400 destroy_bb_vec_info (bb_vinfo);
2401 return NULL;
2402 }
2403
2404 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2405 {
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "not vectorized: unhandled data access in "
2409 "basic block.\n");
2410
2411 destroy_bb_vec_info (bb_vinfo);
2412 return NULL;
2413 }
2414
2415 vect_pattern_recog (NULL, bb_vinfo);
2416
2417 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2418 {
2419 if (dump_enabled_p ())
2420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2421 "not vectorized: bad data alignment in basic "
2422 "block.\n");
2423
2424 destroy_bb_vec_info (bb_vinfo);
2425 return NULL;
2426 }
2427
2428 /* Check the SLP opportunities in the basic block, analyze and build SLP
2429 trees. */
2430 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2431 {
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2434 "not vectorized: failed to find SLP opportunities "
2435 "in basic block.\n");
2436
2437 destroy_bb_vec_info (bb_vinfo);
2438 return NULL;
2439 }
2440
2441 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2442
2443 /* Mark all the statements that we want to vectorize as pure SLP and
2444 relevant. */
2445 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2446 {
2447 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2448 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2449 }
2450
2451 /* Mark all the statements that we do not want to vectorize. */
2452 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2453 !gsi_end_p (gsi); gsi_next (&gsi))
2454 {
2455 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2456 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2457 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2458 }
2459
2460 /* Analyze dependences. At this point all stmts not participating in
2461 vectorization have to be marked. Dependence analysis assumes
2462 that we either vectorize all SLP instances or none at all. */
2463 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2464 {
2465 if (dump_enabled_p ())
2466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2467 "not vectorized: unhandled data dependence "
2468 "in basic block.\n");
2469
2470 destroy_bb_vec_info (bb_vinfo);
2471 return NULL;
2472 }
2473
2474 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2475 {
2476 if (dump_enabled_p ())
2477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2478 "not vectorized: unsupported alignment in basic "
2479 "block.\n");
2480 destroy_bb_vec_info (bb_vinfo);
2481 return NULL;
2482 }
2483
2484 if (!vect_slp_analyze_operations (bb_vinfo))
2485 {
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2488 "not vectorized: bad operation in basic block.\n");
2489
2490 destroy_bb_vec_info (bb_vinfo);
2491 return NULL;
2492 }
2493
2494 /* Compute the costs of the SLP instances. */
2495 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2496 {
2497 gimple stmt = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2498 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2499 vect_analyze_slp_cost (NULL, bb_vinfo,
2500 instance, TYPE_VECTOR_SUBPARTS (vectype));
2501 }
2502
2503 /* Cost model: check if the vectorization is worthwhile. */
2504 if (!unlimited_cost_model (NULL)
2505 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2506 {
2507 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2509 "not vectorized: vectorization is not "
2510 "profitable.\n");
2511
2512 destroy_bb_vec_info (bb_vinfo);
2513 return NULL;
2514 }
2515
2516 if (dump_enabled_p ())
2517 dump_printf_loc (MSG_NOTE, vect_location,
2518 "Basic block will be vectorized using SLP\n");
2519
2520 return bb_vinfo;
2521 }
2522
2523
2524 bb_vec_info
2525 vect_slp_analyze_bb (basic_block bb)
2526 {
2527 bb_vec_info bb_vinfo;
2528 int insns = 0;
2529 gimple_stmt_iterator gsi;
2530 unsigned int vector_sizes;
2531
2532 if (dump_enabled_p ())
2533 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2534
2535 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2536 {
2537 gimple stmt = gsi_stmt (gsi);
2538 if (!is_gimple_debug (stmt)
2539 && !gimple_nop_p (stmt)
2540 && gimple_code (stmt) != GIMPLE_LABEL)
2541 insns++;
2542 }
2543
2544 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2545 {
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2548 "not vectorized: too many instructions in "
2549 "basic block.\n");
2550
2551 return NULL;
2552 }
2553
2554 /* Autodetect first vector size we try. */
2555 current_vector_size = 0;
2556 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
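/* Illustrative note (not part of the original source): vector_sizes is
   a bitmask of the vector sizes (in bytes) the target supports; on a
   target offering 32-byte and 16-byte vectors it would be 32 | 16.
   If analysis with the first (autodetected) size fails, the loop below
   clears that size from the mask and retries with the largest
   remaining one, 1 << floor_log2 (vector_sizes).  */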
2557
2558 while (1)
2559 {
2560 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2561 if (bb_vinfo)
2562 return bb_vinfo;
2563
2564 destroy_bb_vec_info (bb_vinfo);
2565
2566 vector_sizes &= ~current_vector_size;
2567 if (vector_sizes == 0
2568 || current_vector_size == 0)
2569 return NULL;
2570
2571 /* Try the next biggest vector size. */
2572 current_vector_size = 1 << floor_log2 (vector_sizes);
2573 if (dump_enabled_p ())
2574 dump_printf_loc (MSG_NOTE, vect_location,
2575 "***** Re-trying analysis with "
2576 "vector size %d\n", current_vector_size);
2577 }
2578 }
2579
2580
2581 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2582 the number of created vector stmts depends on the unrolling factor).
2583 However, the actual number of vector stmts for every SLP node depends on
2584 VF which is set later in vect_analyze_operations (). Hence, SLP costs
2585 should be updated. In this function we assume that the inside costs
2586 calculated in vect_model_xxx_cost are linear in ncopies. */
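/* Illustrative note (not part of the original source): with a loop
   vectorization factor of 8 and an instance unrolling factor of 2,
   ncopies below is 4, so every statement cost recorded for the
   instance is added to the target cost model with its count scaled
   by 4.  */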
2587
2588 void
2589 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2590 {
2591 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2592 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2593 slp_instance instance;
2594 stmt_vector_for_cost body_cost_vec;
2595 stmt_info_for_cost *si;
2596 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2597
2598 if (dump_enabled_p ())
2599 dump_printf_loc (MSG_NOTE, vect_location,
2600 "=== vect_update_slp_costs_according_to_vf ===\n");
2601
2602 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2603 {
2604 /* We assume that costs are linear in ncopies. */
2605 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2606
2607 /* Record the instance's instructions in the target cost model.
2608 This was delayed until here because the count of instructions
2609 isn't known beforehand. */
2610 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2611
2612 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2613 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2614 vinfo_for_stmt (si->stmt), si->misalign,
2615 vect_body);
2616 }
2617 }
2618
2619
2620 /* For constant and loop invariant defs of SLP_NODE this function returns
2621 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2622 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2623 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2624 REDUC_INDEX is the index of the reduction operand in the statements, or -1
2625 if there is no reduction operand. */
2626
2627 static void
2628 vect_get_constant_vectors (tree op, slp_tree slp_node,
2629 vec<tree> *vec_oprnds,
2630 unsigned int op_num, unsigned int number_of_vectors,
2631 int reduc_index)
2632 {
2633 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2634 gimple stmt = stmts[0];
2635 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2636 unsigned nunits;
2637 tree vec_cst;
2638 tree *elts;
2639 unsigned j, number_of_places_left_in_vector;
2640 tree vector_type;
2641 tree vop;
2642 int group_size = stmts.length ();
2643 unsigned int vec_num, i;
2644 unsigned number_of_copies = 1;
2645 vec<tree> voprnds;
2646 voprnds.create (number_of_vectors);
2647 bool constant_p, is_store;
2648 tree neutral_op = NULL;
2649 enum tree_code code = gimple_expr_code (stmt);
2650 gimple def_stmt;
2651 struct loop *loop;
2652 gimple_seq ctor_seq = NULL;
2653
2654 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2655 && reduc_index != -1)
2656 {
2657 op_num = reduc_index - 1;
2658 op = gimple_op (stmt, reduc_index);
2659 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2660 we need either neutral operands or the original operands. See
2661 get_initial_def_for_reduction() for details. */
2662 switch (code)
2663 {
2664 case WIDEN_SUM_EXPR:
2665 case DOT_PROD_EXPR:
2666 case PLUS_EXPR:
2667 case MINUS_EXPR:
2668 case BIT_IOR_EXPR:
2669 case BIT_XOR_EXPR:
2670 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2671 neutral_op = build_real (TREE_TYPE (op), dconst0);
2672 else
2673 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2674
2675 break;
2676
2677 case MULT_EXPR:
2678 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2679 neutral_op = build_real (TREE_TYPE (op), dconst1);
2680 else
2681 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2682
2683 break;
2684
2685 case BIT_AND_EXPR:
2686 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2687 break;
2688
2689 /* For MIN/MAX we don't have an easy neutral operand but
2690 the initial values can be used fine here. Only for
2691 a reduction chain we have to force a neutral element. */
2692 case MAX_EXPR:
2693 case MIN_EXPR:
2694 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2695 neutral_op = NULL;
2696 else
2697 {
2698 def_stmt = SSA_NAME_DEF_STMT (op);
2699 loop = (gimple_bb (stmt))->loop_father;
2700 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2701 loop_preheader_edge (loop));
2702 }
2703 break;
2704
2705 default:
2706 neutral_op = NULL;
2707 }
2708 }
2709
2710 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2711 {
2712 is_store = true;
2713 op = gimple_assign_rhs1 (stmt);
2714 }
2715 else
2716 is_store = false;
2717
2718 gcc_assert (op);
2719
2720 if (CONSTANT_CLASS_P (op))
2721 constant_p = true;
2722 else
2723 constant_p = false;
2724
2725 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2726 gcc_assert (vector_type);
2727 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2728
2729 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2730 created vectors. It is greater than 1 if unrolling is performed.
2731
2732 For example, we have two scalar operands, s1 and s2 (e.g., group of
2733 strided accesses of size two), while NUNITS is four (i.e., four scalars
2734 of this type can be packed in a vector). The output vector will contain
2735 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2736 will be 2).
2737
2738 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2739 containing the operands.
2740
2741 For example, NUNITS is four as before, and the group size is 8
2742 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2743 {s5, s6, s7, s8}. */
2744
2745 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2746
2747 number_of_places_left_in_vector = nunits;
2748 elts = XALLOCAVEC (tree, nunits);
2749 bool place_after_defs = false;
2750 for (j = 0; j < number_of_copies; j++)
2751 {
2752 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2753 {
2754 if (is_store)
2755 op = gimple_assign_rhs1 (stmt);
2756 else
2757 {
2758 switch (code)
2759 {
2760 case COND_EXPR:
2761 if (op_num == 0 || op_num == 1)
2762 {
2763 tree cond = gimple_assign_rhs1 (stmt);
2764 op = TREE_OPERAND (cond, op_num);
2765 }
2766 else
2767 {
2768 if (op_num == 2)
2769 op = gimple_assign_rhs2 (stmt);
2770 else
2771 op = gimple_assign_rhs3 (stmt);
2772 }
2773 break;
2774
2775 case CALL_EXPR:
2776 op = gimple_call_arg (stmt, op_num);
2777 break;
2778
2779 case LSHIFT_EXPR:
2780 case RSHIFT_EXPR:
2781 case LROTATE_EXPR:
2782 case RROTATE_EXPR:
2783 op = gimple_op (stmt, op_num + 1);
2784 /* Unlike the other binary operators, shifts/rotates have
2785 the shift count being int, instead of the same type as
2786 the lhs, so make sure the scalar is the right type if
2787 we are dealing with vectors of
2788 long long/long/short/char. */
2789 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2790 op = fold_convert (TREE_TYPE (vector_type), op);
2791 break;
2792
2793 default:
2794 op = gimple_op (stmt, op_num + 1);
2795 break;
2796 }
2797 }
2798
2799 if (reduc_index != -1)
2800 {
2801 loop = (gimple_bb (stmt))->loop_father;
2802 def_stmt = SSA_NAME_DEF_STMT (op);
2803
2804 gcc_assert (loop);
2805
2806 /* Get the def before the loop. In reduction chain we have only
2807 one initial value. */
2808 if ((j != (number_of_copies - 1)
2809 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2810 && i != 0))
2811 && neutral_op)
2812 op = neutral_op;
2813 else
2814 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2815 loop_preheader_edge (loop));
2816 }
2817
2818 /* Create 'vect_ = {op0,op1,...,opn}'. */
2819 number_of_places_left_in_vector--;
2820 tree orig_op = op;
2821 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2822 {
2823 if (CONSTANT_CLASS_P (op))
2824 {
2825 op = fold_unary (VIEW_CONVERT_EXPR,
2826 TREE_TYPE (vector_type), op);
2827 gcc_assert (op && CONSTANT_CLASS_P (op));
2828 }
2829 else
2830 {
2831 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2832 gimple init_stmt;
2833 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2834 init_stmt
2835 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2836 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2837 op = new_temp;
2838 }
2839 }
2840 elts[number_of_places_left_in_vector] = op;
2841 if (!CONSTANT_CLASS_P (op))
2842 constant_p = false;
2843 if (TREE_CODE (orig_op) == SSA_NAME
2844 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2845 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2846 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2847 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2848 place_after_defs = true;
2849
2850 if (number_of_places_left_in_vector == 0)
2851 {
2852 number_of_places_left_in_vector = nunits;
2853
2854 if (constant_p)
2855 vec_cst = build_vector (vector_type, elts);
2856 else
2857 {
2858 vec<constructor_elt, va_gc> *v;
2859 unsigned k;
2860 vec_alloc (v, nunits);
2861 for (k = 0; k < nunits; ++k)
2862 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2863 vec_cst = build_constructor (vector_type, v);
2864 }
2865 tree init;
2866 gimple_stmt_iterator gsi;
2867 if (place_after_defs)
2868 {
2869 gsi = gsi_for_stmt
2870 (vect_find_last_scalar_stmt_in_slp (slp_node));
2871 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2872 }
2873 else
2874 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2875 if (ctor_seq != NULL)
2876 {
2877 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2878 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2879 GSI_SAME_STMT);
2880 ctor_seq = NULL;
2881 }
2882 voprnds.quick_push (init);
2883 place_after_defs = false;
2884 }
2885 }
2886 }
2887
2888 /* Since the vectors are created in reverse order, we should reverse
2889 them. */
2890 vec_num = voprnds.length ();
2891 for (j = vec_num; j != 0; j--)
2892 {
2893 vop = voprnds[j - 1];
2894 vec_oprnds->quick_push (vop);
2895 }
2896
2897 voprnds.release ();
2898
2899 /* If VF is greater than the unrolling factor needed for the SLP group of
2900 stmts, the NUMBER_OF_VECTORS to be created is greater than
2901 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2902 to replicate the vectors. */
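/* Illustrative note (not part of the original source): for a reduction
   with a known neutral element the extra vectors pushed below are
   neutral vectors (e.g. {0,0,0,0} for a sum), otherwise the already
   created vectors are simply repeated.  */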
2903 while (number_of_vectors > vec_oprnds->length ())
2904 {
2905 tree neutral_vec = NULL;
2906
2907 if (neutral_op)
2908 {
2909 if (!neutral_vec)
2910 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2911
2912 vec_oprnds->quick_push (neutral_vec);
2913 }
2914 else
2915 {
2916 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2917 vec_oprnds->quick_push (vop);
2918 }
2919 }
2920 }
2921
2922
2923 /* Get vectorized definitions from SLP_NODE that contains corresponding
2924 vectorized def-stmts. */
2925
2926 static void
2927 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2928 {
2929 tree vec_oprnd;
2930 gimple vec_def_stmt;
2931 unsigned int i;
2932
2933 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2934
2935 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2936 {
2937 gcc_assert (vec_def_stmt);
2938 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2939 vec_oprnds->quick_push (vec_oprnd);
2940 }
2941 }
2942
2943
2944 /* Get vectorized definitions for SLP_NODE.
2945 If the scalar definitions are loop invariants or constants, collect them and
2946 call vect_get_constant_vectors() to create vector stmts.
2947 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2948 must be stored in the corresponding child of SLP_NODE, and we call
2949 vect_get_slp_vect_defs () to retrieve them. */
2950
2951 void
2952 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2953 vec<vec<tree> > *vec_oprnds, int reduc_index)
2954 {
2955 gimple first_stmt;
2956 int number_of_vects = 0, i;
2957 unsigned int child_index = 0;
2958 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2959 slp_tree child = NULL;
2960 vec<tree> vec_defs;
2961 tree oprnd;
2962 bool vectorized_defs;
2963
2964 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2965 FOR_EACH_VEC_ELT (ops, i, oprnd)
2966 {
2967 /* For each operand we check if it has vectorized definitions in a child
2968 node or we need to create them (for invariants and constants). We
2969 check if the LHS of the first stmt of the next child matches OPRND.
2970 If it does, we found the correct child. Otherwise, we call
2971 vect_get_constant_vectors (), and do not advance CHILD_INDEX in order
2972 to check this child node for the next operand. */
2973 vectorized_defs = false;
2974 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2975 {
2976 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2977
2978 /* We have to check both pattern and original def, if available. */
2979 if (child)
2980 {
2981 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2982 gimple related
2983 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2984
2985 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2986 || (related
2987 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2988 {
2989 /* The number of vector defs is determined by the number of
2990 vector statements in the node from which we get those
2991 statements. */
2992 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2993 vectorized_defs = true;
2994 child_index++;
2995 }
2996 }
2997 else
2998 child_index++;
2999 }
3000
3001 if (!vectorized_defs)
3002 {
3003 if (i == 0)
3004 {
3005 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3006 /* Number of vector stmts was calculated according to LHS in
3007 vect_schedule_slp_instance (); fix it by replacing LHS with
3008 RHS, if necessary. See vect_get_smallest_scalar_type () for
3009 details. */
3010 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
3011 &rhs_size_unit);
3012 if (rhs_size_unit != lhs_size_unit)
3013 {
3014 number_of_vects *= rhs_size_unit;
3015 number_of_vects /= lhs_size_unit;
3016 }
3017 }
3018 }
3019
3020 /* Allocate memory for vectorized defs. */
3021 vec_defs = vNULL;
3022 vec_defs.create (number_of_vects);
3023
3024 /* For reduction defs we call vect_get_constant_vectors (), since we are
3025 looking for initial loop invariant values. */
3026 if (vectorized_defs && reduc_index == -1)
3027 /* The defs are already vectorized. */
3028 vect_get_slp_vect_defs (child, &vec_defs);
3029 else
3030 /* Build vectors from scalar defs. */
3031 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3032 number_of_vects, reduc_index);
3033
3034 vec_oprnds->quick_push (vec_defs);
3035
3036 /* For reductions, we only need initial values. */
3037 if (reduc_index != -1)
3038 return;
3039 }
3040 }
3041
3042
3043 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
3044 building a vector of type MASK_TYPE from it) and two input vectors placed in
3045 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
3046 shifting by STRIDE elements of DR_CHAIN for every copy.
3047 (STRIDE is the number of vectorized stmts for NODE divided by the number of
3048 copies).
3049 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
3050 the created stmts must be inserted. */
3051
3052 static inline void
3053 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
3054 tree mask, int first_vec_indx, int second_vec_indx,
3055 gimple_stmt_iterator *gsi, slp_tree node,
3056 tree vectype, vec<tree> dr_chain,
3057 int ncopies, int vect_stmts_counter)
3058 {
3059 tree perm_dest;
3060 gimple perm_stmt = NULL;
3061 stmt_vec_info next_stmt_info;
3062 int i, stride;
3063 tree first_vec, second_vec, data_ref;
3064
3065 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3066
3067 /* Initialize the vect stmts of NODE to properly insert the generated
3068 stmts later. */
3069 for (i = SLP_TREE_VEC_STMTS (node).length ();
3070 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3071 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3072
3073 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3074 for (i = 0; i < ncopies; i++)
3075 {
3076 first_vec = dr_chain[first_vec_indx];
3077 second_vec = dr_chain[second_vec_indx];
3078
3079 /* Generate the permute statement. */
3080 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3081 first_vec, second_vec, mask);
3082 data_ref = make_ssa_name (perm_dest, perm_stmt);
3083 gimple_set_lhs (perm_stmt, data_ref);
3084 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3085
3086 /* Store the vector statement in NODE. */
3087 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3088
3089 first_vec_indx += stride;
3090 second_vec_indx += stride;
3091 }
3092
3093 /* Mark the scalar stmt as vectorized. */
3094 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
3095 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
3096 }
3097
3098
3099 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
3100 return in CURRENT_MASK_ELEMENT its equivalent in target specific
3101 representation. Check that the mask is valid and return FALSE if not.
3102 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
3103 the next vector, i.e., the current first vector is not needed. */
3104
3105 static bool
3106 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
3107 int mask_nunits, bool only_one_vec, int index,
3108 unsigned char *mask, int *current_mask_element,
3109 bool *need_next_vector, int *number_of_mask_fixes,
3110 bool *mask_fixed, bool *needs_first_vector)
3111 {
3112 int i;
3113
3114 /* Convert to target specific representation. */
3115 *current_mask_element = first_mask_element + m;
3116 /* Adjust the value in case it's a mask for second and third vectors. */
3117 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3118
3119 if (*current_mask_element < 0)
3120 {
3121 if (dump_enabled_p ())
3122 {
3123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3124 "permutation requires past vector ");
3125 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3126 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3127 }
3128 return false;
3129 }
3130
3131 if (*current_mask_element < mask_nunits)
3132 *needs_first_vector = true;
3133
3134 /* We have only one input vector to permute but the mask accesses values in
3135 the next vector as well. */
3136 if (only_one_vec && *current_mask_element >= mask_nunits)
3137 {
3138 if (dump_enabled_p ())
3139 {
3140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3141 "permutation requires at least two vectors ");
3142 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3143 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3144 }
3145
3146 return false;
3147 }
3148
3149 /* The mask requires the next vector. */
3150 while (*current_mask_element >= mask_nunits * 2)
3151 {
3152 if (*needs_first_vector || *mask_fixed)
3153 {
3154 /* We either need the first vector too or have already moved to the
3155 next vector. In both cases, this permutation needs three
3156 vectors. */
3157 if (dump_enabled_p ())
3158 {
3159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3160 "permutation requires at "
3161 "least three vectors ");
3162 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3163 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3164 }
3165
3166 return false;
3167 }
3168
3169 /* We move to the next vector, dropping the first one and working with
3170 the second and the third - we need to adjust the values of the mask
3171 accordingly. */
3172 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3173
3174 for (i = 0; i < index; i++)
3175 mask[i] -= mask_nunits * *number_of_mask_fixes;
3176
3177 (*number_of_mask_fixes)++;
3178 *mask_fixed = true;
3179 }
3180
3181 *need_next_vector = *mask_fixed;
3182
3183 /* This was the last element of this mask. Start a new one. */
3184 if (index == mask_nunits - 1)
3185 {
3186 *number_of_mask_fixes = 1;
3187 *mask_fixed = false;
3188 *needs_first_vector = false;
3189 }
3190
3191 return true;
3192 }
3193
3194
3195 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3196 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3197 permute statements for the SLP node NODE of the SLP instance
3198 SLP_NODE_INSTANCE. */
3199
3200 bool
3201 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3202 gimple_stmt_iterator *gsi, int vf,
3203 slp_instance slp_node_instance, bool analyze_only)
3204 {
3205 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3206 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3207 tree mask_element_type = NULL_TREE, mask_type;
3208 int i, j, k, nunits, vec_index = 0, scalar_index;
3209 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3210 gimple next_scalar_stmt;
3211 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3212 int first_mask_element;
3213 int index, unroll_factor, current_mask_element, ncopies;
3214 unsigned char *mask;
3215 bool only_one_vec = false, need_next_vector = false;
3216 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3217 int number_of_mask_fixes = 1;
3218 bool mask_fixed = false;
3219 bool needs_first_vector = false;
3220 machine_mode mode;
3221
3222 mode = TYPE_MODE (vectype);
3223
3224 if (!can_vec_perm_p (mode, false, NULL))
3225 {
3226 if (dump_enabled_p ())
3227 {
3228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3229 "no vect permute for ");
3230 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3231 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3232 }
3233 return false;
3234 }
3235
3236 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3237 same size as the vector element being permuted. */
3238 mask_element_type = lang_hooks.types.type_for_mode
3239 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3240 mask_type = get_vectype_for_scalar_type (mask_element_type);
3241 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3242 mask = XALLOCAVEC (unsigned char, nunits);
3243 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3244
3245 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
3246 unrolling factor. */
3247 orig_vec_stmts_num = group_size *
3248 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3249 if (orig_vec_stmts_num == 1)
3250 only_one_vec = true;
3251
3252 /* Number of copies is determined by the final vectorization factor
3253 relative to SLP_NODE_INSTANCE unrolling factor. */
3254 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3255
3256 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3257 return false;
3258
3259 /* Generate permutation masks for every NODE. Number of masks for each NODE
3260 is equal to GROUP_SIZE.
3261 E.g., we have a group of three nodes with three loads from the same
3262 location in each node, and the vector size is 4. I.e., we have an
3263 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3264 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3265 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3266 ...
3267
3268 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3269 The last mask is illegal since we assume two operands for permute
3270 operation, and the mask element values can't be outside that range.
3271 Hence, the last mask must be converted into {2,5,5,5}.
3272 For the first two permutations we need the first and the second input
3273 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3274 we need the second and the third vectors: {b1,c1,a2,b2} and
3275 {c2,a3,b3,c3}. */
3276
3277 {
3278 scalar_index = 0;
3279 index = 0;
3280 vect_stmts_counter = 0;
3281 vec_index = 0;
3282 first_vec_index = vec_index++;
3283 if (only_one_vec)
3284 second_vec_index = first_vec_index;
3285 else
3286 second_vec_index = vec_index++;
3287
3288 for (j = 0; j < unroll_factor; j++)
3289 {
3290 for (k = 0; k < group_size; k++)
3291 {
3292 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3293 first_mask_element = i + j * group_size;
3294 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3295 nunits, only_one_vec, index,
3296 mask, &current_mask_element,
3297 &need_next_vector,
3298 &number_of_mask_fixes, &mask_fixed,
3299 &needs_first_vector))
3300 return false;
3301 gcc_assert (current_mask_element >= 0
3302 && current_mask_element < 2 * nunits);
3303 mask[index++] = current_mask_element;
3304
3305 if (index == nunits)
3306 {
3307 index = 0;
3308 if (!can_vec_perm_p (mode, false, mask))
3309 {
3310 if (dump_enabled_p ())
3311 {
3312 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3313 vect_location,
3314 "unsupported vect permute { ");
3315 for (i = 0; i < nunits; ++i)
3316 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3317 mask[i]);
3318 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3319 }
3320 return false;
3321 }
3322
3323 if (!analyze_only)
3324 {
3325 int l;
3326 tree mask_vec, *mask_elts;
3327 mask_elts = XALLOCAVEC (tree, nunits);
3328 for (l = 0; l < nunits; ++l)
3329 mask_elts[l] = build_int_cst (mask_element_type,
3330 mask[l]);
3331 mask_vec = build_vector (mask_type, mask_elts);
3332
3333 if (need_next_vector)
3334 {
3335 first_vec_index = second_vec_index;
3336 second_vec_index = vec_index;
3337 }
3338
3339 next_scalar_stmt
3340 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3341
3342 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3343 mask_vec, first_vec_index, second_vec_index,
3344 gsi, node, vectype, dr_chain,
3345 ncopies, vect_stmts_counter++);
3346 }
3347 }
3348 }
3349 }
3350 }
3351
3352 return true;
3353 }
3354
3355
3356
3357 /* Vectorize SLP instance tree in postorder. */
3358
3359 static bool
3360 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3361 unsigned int vectorization_factor)
3362 {
3363 gimple stmt;
3364 bool grouped_store, is_store;
3365 gimple_stmt_iterator si;
3366 stmt_vec_info stmt_info;
3367 unsigned int vec_stmts_size, nunits, group_size;
3368 tree vectype;
3369 int i;
3370 slp_tree child;
3371
3372 if (!node)
3373 return false;
3374
3375 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3376 vect_schedule_slp_instance (child, instance, vectorization_factor);
3377
3378 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3379 stmt_info = vinfo_for_stmt (stmt);
3380
3381 /* VECTYPE is the type of the destination. */
3382 vectype = STMT_VINFO_VECTYPE (stmt_info);
3383 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3384 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3385
3386 /* For each SLP instance calculate number of vector stmts to be created
3387 for the scalar stmts in each node of the SLP tree. Number of vector
3388 elements in one vector iteration is the number of scalar elements in
3389 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3390 size. */
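/* Illustrative note (not part of the original source): with VF = 4,
   GROUP_SIZE = 2 and 4 elements per vector, vec_stmts_size below is
   (4 * 2) / 4 = 2 vector stmts for this node.  */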
3391 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3392
3393 if (!SLP_TREE_VEC_STMTS (node).exists ())
3394 {
3395 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3396 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3397 }
3398
3399 if (dump_enabled_p ())
3400 {
3401 dump_printf_loc (MSG_NOTE,vect_location,
3402 "------>vectorizing SLP node starting from: ");
3403 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3404 dump_printf (MSG_NOTE, "\n");
3405 }
3406
3407 /* Vectorized stmts go before the last scalar stmt which is where
3408 all uses are ready. */
3409 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3410
3411 /* Mark the first element of the reduction chain as reduction to properly
3412 transform the node. In the analysis phase only the last element of the
3413 chain is marked as reduction. */
3414 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3415 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3416 {
3417 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3418 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3419 }
3420
3421 /* Handle two-operation SLP nodes by vectorizing the group with
3422 both operations and then performing a merge. */
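/* Illustrative note (not part of the original source): for a group
   like { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the node is vectorized
   once with PLUS and once with MINUS, and the two results are then
   combined with a VEC_PERM_EXPR whose constant mask picks the lanes
   belonging to each operation.  */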
3423 if (SLP_TREE_TWO_OPERATORS (node))
3424 {
3425 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3426 enum tree_code ocode;
3427 gimple ostmt;
3428 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3429 bool allsame = true;
3430 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3431 if (gimple_assign_rhs_code (ostmt) != code0)
3432 {
3433 mask[i] = 1;
3434 allsame = false;
3435 ocode = gimple_assign_rhs_code (ostmt);
3436 }
3437 else
3438 mask[i] = 0;
3439 if (!allsame)
3440 {
3441 vec<gimple> v0;
3442 vec<gimple> v1;
3443 unsigned j;
3444 tree tmask = NULL_TREE;
3445 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3446 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3447 SLP_TREE_VEC_STMTS (node).truncate (0);
3448 gimple_assign_set_rhs_code (stmt, ocode);
3449 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3450 gimple_assign_set_rhs_code (stmt, code0);
3451 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3452 SLP_TREE_VEC_STMTS (node).truncate (0);
3453 tree meltype = build_nonstandard_integer_type
3454 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3455 tree mvectype = get_same_sized_vectype (meltype, vectype);
3456 unsigned k = 0, l;
3457 for (j = 0; j < v0.length (); ++j)
3458 {
3459 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3460 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3461 {
3462 if (k > group_size)
3463 k = 0;
3464 melts[l] = build_int_cst
3465 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3466 }
3467 tmask = build_vector (mvectype, melts);
3468
3469 /* ??? Not all targets support a VEC_PERM_EXPR with a
3470 constant mask that would translate to a vec_merge RTX
3471 (with their vec_perm_const_ok). We can either not
3472 vectorize in that case or let veclower do its job.
3473 Unfortunately that isn't too great and at least for
3474 plus/minus we'd eventually like to match targets
3475 vector addsub instructions. */
3476 gimple vstmt;
3477 vstmt = gimple_build_assign (make_ssa_name (vectype),
3478 VEC_PERM_EXPR,
3479 gimple_assign_lhs (v0[j]),
3480 gimple_assign_lhs (v1[j]), tmask);
3481 vect_finish_stmt_generation (stmt, vstmt, &si);
3482 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3483 }
3484 v0.release ();
3485 v1.release ();
3486 return false;
3487 }
3488 }
3489 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3490 return is_store;
3491 }
3492
3493 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3494 For loop vectorization this is done in vectorizable_call, but for SLP
3495 it needs to be deferred until end of vect_schedule_slp, because multiple
3496 SLP instances may refer to the same scalar stmt. */
3497
3498 static void
3499 vect_remove_slp_scalar_calls (slp_tree node)
3500 {
3501 gimple stmt, new_stmt;
3502 gimple_stmt_iterator gsi;
3503 int i;
3504 slp_tree child;
3505 tree lhs;
3506 stmt_vec_info stmt_info;
3507
3508 if (!node)
3509 return;
3510
3511 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3512 vect_remove_slp_scalar_calls (child);
3513
3514 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3515 {
3516 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3517 continue;
3518 stmt_info = vinfo_for_stmt (stmt);
3519 if (stmt_info == NULL
3520 || is_pattern_stmt_p (stmt_info)
3521 || !PURE_SLP_STMT (stmt_info))
3522 continue;
3523 lhs = gimple_call_lhs (stmt);
3524 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3525 set_vinfo_for_stmt (new_stmt, stmt_info);
3526 set_vinfo_for_stmt (stmt, NULL);
3527 STMT_VINFO_STMT (stmt_info) = new_stmt;
3528 gsi = gsi_for_stmt (stmt);
3529 gsi_replace (&gsi, new_stmt, false);
3530 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3531 }
3532 }
3533
3534 /* Generate vector code for all SLP instances in the loop/basic block. */
3535
3536 bool
3537 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3538 {
3539 vec<slp_instance> slp_instances;
3540 slp_instance instance;
3541 unsigned int i, vf;
3542 bool is_store = false;
3543
3544 if (loop_vinfo)
3545 {
3546 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3547 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3548 }
3549 else
3550 {
3551 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3552 vf = 1;
3553 }
3554
3555 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3556 {
3557 /* Schedule the tree of INSTANCE. */
3558 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3559 instance, vf);
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "vectorizing stmts using SLP.\n");
3563 }
3564
3565 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3566 {
3567 slp_tree root = SLP_INSTANCE_TREE (instance);
3568 gimple store;
3569 unsigned int j;
3570 gimple_stmt_iterator gsi;
3571
3572 /* Remove scalar call stmts. Do not do this for basic-block
3573 vectorization as not all uses may be vectorized.
3574 ??? Why should this be necessary? DCE should be able to
3575 remove the stmts itself.
3576 ??? For BB vectorization we can as well remove scalar
3577 stmts starting from the SLP tree root if they have no
3578 uses. */
3579 if (loop_vinfo)
3580 vect_remove_slp_scalar_calls (root);
3581
3582 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3583 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3584 {
3585 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3586 break;
3587
3588 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3589 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3590 /* Free the attached stmt_vec_info and remove the stmt. */
3591 gsi = gsi_for_stmt (store);
3592 unlink_stmt_vdef (store);
3593 gsi_remove (&gsi, true);
3594 release_defs (store);
3595 free_stmt_vec_info (store);
3596 }
3597 }
3598
3599 return is_store;
3600 }
3601
3602
3603 /* Vectorize the basic block. */
3604
3605 void
3606 vect_slp_transform_bb (basic_block bb)
3607 {
3608 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3609 gimple_stmt_iterator si;
3610
3611 gcc_assert (bb_vinfo);
3612
3613 if (dump_enabled_p ())
3614 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3615
3616 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3617 {
3618 gimple stmt = gsi_stmt (si);
3619 stmt_vec_info stmt_info;
3620
3621 if (dump_enabled_p ())
3622 {
3623 dump_printf_loc (MSG_NOTE, vect_location,
3624 "------>SLPing statement: ");
3625 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3626 dump_printf (MSG_NOTE, "\n");
3627 }
3628
3629 stmt_info = vinfo_for_stmt (stmt);
3630 gcc_assert (stmt_info);
3631
3632 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3633 if (STMT_SLP_TYPE (stmt_info))
3634 {
3635 vect_schedule_slp (NULL, bb_vinfo);
3636 break;
3637 }
3638 }
3639
3640 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "BASIC BLOCK VECTORIZED\n");
3643
3644 destroy_bb_vec_info (bb_vinfo);
3645 }