/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "target.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimple-iterator.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "insn-config.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "insn-codes.h"
#include "optabs.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"

/* Extract the location of the basic block in the source code.
   Return the location of the first stmt in BB that has one, and
   UNKNOWN_LOCATION if no stmt has a location or BB is NULL.  */

source_location
find_bb_location (basic_block bb)
{
  gimple stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple> scalar_stmts)
{
  slp_tree node;
  gimple stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
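      /* A COND_EXPR's embedded comparison contributes two operands
         (see vect_get_and_check_slp_defs), so count one extra.  */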
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;

  return node;
}


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

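/* For example, in a chain of accesses A[0], A[1], A[2], A[3], with FIRST_STMT
   being the access to A[0], the access to A[2] is at place 2.  */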
static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      result++;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping operands of the operation, and 0
   if everything is OK.  */

static int
vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                             gimple stmt, bool first,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
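  /* On a mismatch of the first operand of a commutative operation we retry
     once with the operands swapped; "swapped ? !i : i" below maps operand
     index 0 to 1 and vice versa.  */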
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
                               &def, &dt)
          || (!def_stmt && dt != vect_constant_def))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't find def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern)
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
              def = gimple_phi_result (def_stmt);
              break;

            case GIMPLE_ASSIGN:
              def = gimple_assign_lhs (def_stmt);
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}


/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation and are not of unsupported types of operation.  Return
   true if so; otherwise return false and indicate in *MATCHES which
   stmts are not isomorphic to the first one.  If MATCHES[0] is false
   then this indicates that the comparison could not be carried out or
   the stmts will never be vectorized by SLP.  */

static bool
vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches)
{
  unsigned int i;
  gimple stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  struct data_reference *first_dr;
  HOST_WIDE_INT dummy;
  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (bb_vinfo)
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              unsigned unrolling_factor
                = least_common_multiple
                    (*max_nunits, group_size) / group_size;
              /* FORNOW: Check that there is no gap between the loads
                 and no gap between the groups when we need to load
                 multiple groups at once.
                 ??? We should enhance this to only disallow gaps
                 inside vectors.  */
              if ((unrolling_factor > 1
                   && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                        && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
                       /* If the group is split up then GROUP_GAP
                          isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
                       || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: grouped "
                                       "loads have gaps ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (loop_vinfo
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              old_first_load = first_load;
              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;

              /* In some cases a group of loads is just the same load
                 repeated N times.  Only analyze its cost once.  */
              if (first_load == stmt && old_first_load != first_load)
                {
                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
                  if (vect_supportable_dr_alignment (first_dr, false)
                      == dr_unaligned_unsupported)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: unsupported "
                                           "unaligned load ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                }
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Not grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && rhs_code != COND_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  return true;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return FALSE) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return TRUE; *MATCHES records which scalar stmts failed to match the
   first one, with a false MATCHES[0] signalling a fatal mismatch.  */

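/* For example, the pair of stores

     a[0] = b[0] + c[0];
     a[1] = b[1] + c[1];

   builds a root node holding both stores, with one child node for the
   loads of B and one for the loads of C.  */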
static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches))
    return false;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
                                           stmt, (i == 0), &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                gimple tem = oprnds_info[0]->def_stmts[j];
                oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
                oprnds_info[1]->def_stmts[j] = tem;
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again ...  */
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   matches, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple stmt;
  vec<gimple> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

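  /* Scatter: the stmt at old position I moves to position PERMUTATION[I];
     e.g. permutation { 1, 0 } swaps the two stmts of a node.  */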
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  sbitmap load_index;
  slp_tree node;
  gimple stmt, load, next_load, first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      slp_tree load;
      unsigned int lidx;

      /* Compare all the permutation sequences to the first one.  We know
         that at least one load is permuted.  */
      node = SLP_INSTANCE_LOADS (slp_instn)[0];
      if (!node->load_permutation.exists ())
        return false;
      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
        {
          if (!load->load_permutation.exists ())
            return false;
          FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
            if (lidx != node->load_permutation[j])
              return false;
        }

      /* Check that the loads in the first sequence are different and there
         are no gaps between them.  */
      load_index = sbitmap_alloc (group_size);
      bitmap_clear (load_index);
      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
        {
          if (bitmap_bit_p (load_index, lidx))
            {
              sbitmap_free (load_index);
              return false;
            }
          bitmap_set_bit (load_index, lidx);
        }
      for (i = 0; i < group_size; i++)
        if (!bitmap_bit_p (load_index, i))
          {
            sbitmap_free (load_index);
            return false;
          }
      sbitmap_free (load_index);

      /* This permutation is valid for reduction.  Since the order of the
         statements in the nodes is not important unless they are memory
         accesses, we can rearrange the statements in all the nodes
         according to the order of the loads.  */
      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                                node->load_permutation);

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check that for every node in the instance the loads
         form a subchain.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0 && next_load != load)
                return false;
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ??? This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      /* We are done, no actual permutations need to be generated.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      return true;
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well (unless it's reduction).  */
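  /* E.g., for GROUP_SIZE 2 the supported permutations are { 0, 0 } and
     { 1, 1 }: each load node reads a single dr GROUP_SIZE times.  */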
  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
    return false;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (!node->load_permutation.exists ())
      return false;

  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      unsigned int lidx = node->load_permutation[0];
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
        if (k != lidx)
          {
            sbitmap_free (load_index);
            return false;
          }
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;
  return true;
}


/* Find the last scalar stmt in the SLP tree NODE; for a store group this
   is the last store of the SLP instance.  */

static gimple
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple last = NULL, stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                         slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         unsigned ncopies_for_cost)
{
  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);

  unsigned i;
  slp_tree child;
  gimple stmt, s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                             instance, child, prologue_cost_vec,
                             ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ??? Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                      stmt_info, 0, vect_body);

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree def, op = gimple_op (stmt, i);
      gimple def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
                              &def_stmt, &def, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ??? We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       slp_instance instance, unsigned nunits)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
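  /* E.g., NUNITS 4 and GROUP_SIZE 6 give lcm (4, 6) / 4 = 3 copies of
     each vector stmt.  */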
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
                           instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  Unlike the body costs, we know
     the final values now regardless of the loop vectorization factor.  */
  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
                : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  prologue_cost_vec.release ();
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           gimple stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (loop_vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (loop_vinfo);
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (loop_vinfo)
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
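  /* E.g., a group of 2 stmts with 4-element vectors gives
     lcm (4, 2) / 2 = 2: the group must be unrolled twice to fill
     a vector.  */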
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
  if (unrolling_factor != 1 && !loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, matches, &npermutes, NULL,
                           max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && !loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
      SLP_INSTANCE_LOADS (new_instance) = loads;

      /* Compute the load permutation.  */
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple load, first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted)
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }
        }

      if (loop_vinfo)
        {
          /* Compute the costs of this SLP instance.  Delay this for BB
             vectorization as we don't have vector types computed yet.  */
          vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
                                 new_instance, TYPE_VECTOR_SUBPARTS (vectype));
          LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
        }
      else
        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}


/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                  unsigned max_tree_size)
{
  unsigned int i;
  vec<gimple> grouped_stores;
  vec<gimple> reductions = vNULL;
  vec<gimple> reduc_chains = vNULL;
  gimple first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  if (loop_vinfo)
    {
      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
    }
  else
    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                   max_tree_size))
      ok = true;

  if (bb_vinfo && !ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Failed to SLP the basic block.\n");

      return false;
    }

  if (loop_vinfo
      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
                                       max_tree_size))
          ok = true;
        else
          return false;

      /* Don't try to vectorize SLP reductions if reduction chain was
         detected.  */
      return ok;
    }

  /* Find SLP sequences starting from groups of reductions.  */
  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
                                    max_tree_size))
    ok = true;

  return true;
}


/* For each possible SLP instance decide whether to SLP it and calculate the
   overall unrolling factor needed to SLP the loop.  Return TRUE if decided
   to SLP at least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i, unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
                     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
        unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Decided to SLP %d instances. Unrolling factor %d\n",
                     decided_to_slp, unrolling_factor);

  return (decided_to_slp > 0);
}


/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */

static void
vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
{
  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
  imm_use_iterator imm_iter;
  gimple use_stmt;
  stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int j;

  /* Propagate hybrid down the SLP tree.  */
  if (stype == hybrid)
    ;
  else if (HYBRID_SLP_STMT (stmt_vinfo))
    stype = hybrid;
  else
    {
      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
      if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
        FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
          if (gimple_bb (use_stmt)
              && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
              && (use_vinfo = vinfo_for_stmt (use_stmt))
              && !STMT_SLP_TYPE (use_vinfo)
              && (STMT_VINFO_RELEVANT (use_vinfo)
                  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
                  || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
                      && STMT_VINFO_RELATED_STMT (use_vinfo)
                      && !STMT_SLP_TYPE (vinfo_for_stmt
                            (STMT_VINFO_RELATED_STMT (use_vinfo)))))
              && !(gimple_code (use_stmt) == GIMPLE_PHI
                   && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
            stype = hybrid;
    }

  if (stype == hybrid)
    STMT_SLP_TYPE (stmt_vinfo) = hybrid;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    vect_detect_hybrid_slp_stmts (child, i, stype);
}
1890
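/* As an illustration, consider this hypothetical group of two stores:

     x_1 = b_1 + 1;
     a[0] = x_1;
     a[1] = c_1;
     d_1 = x_1 * 2;    <-- not part of any SLP instance

   If the two stores form an SLP instance, x_1 = b_1 + 1 starts out as
   pure SLP, but its def x_1 also has a use in the non-SLP stmt
   d_1 = x_1 * 2, so the walk above marks it as hybrid and propagates
   hybrid down to its SLP children. */
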
1891 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
1892
1893 static tree
1894 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1895 {
1896 walk_stmt_info *wi = (walk_stmt_info *)data;
1897 struct loop *loopp = (struct loop *)wi->info;
1898
1899 if (wi->is_lhs)
1900 return NULL_TREE;
1901
1902 if (TREE_CODE (*tp) == SSA_NAME
1903 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1904 {
1905 gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1906 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1907 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1908 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1909 }
1910
1911 return NULL_TREE;
1912 }
1913
1914 static tree
1915 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1916 walk_stmt_info *)
1917 {
1918 /* If the stmt is in an SLP instance then this isn't a reason
1919 to mark use definitions in other SLP instances as hybrid. */
1920 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1921 *handled = true;
1922 return NULL_TREE;
1923 }
1924
1925 /* Find stmts that must be both vectorized and SLPed. */
1926
1927 void
1928 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1929 {
1930 unsigned int i;
1931 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1932 slp_instance instance;
1933
1934 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1936 "\n");
1937
1938 /* First walk all pattern stmts in the loop and mark the defs of their
1939 uses as hybrid, because immediate uses in pattern stmts are not recorded. */
1940 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1941 {
1942 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1943 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1944 gsi_next (&gsi))
1945 {
1946 gimple stmt = gsi_stmt (gsi);
1947 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1948 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1949 {
1950 walk_stmt_info wi;
1951 memset (&wi, 0, sizeof (wi));
1952 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1953 gimple_stmt_iterator gsi2
1954 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1955 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1956 vect_detect_hybrid_slp_1, &wi);
1957 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1958 vect_detect_hybrid_slp_2,
1959 vect_detect_hybrid_slp_1, &wi);
1960 }
1961 }
1962 }
1963
1964 /* Then walk the SLP instance trees, marking stmts with uses in
1965 non-SLP stmts as hybrid and propagating hybrid down each
1966 SLP tree, collecting the above info on the fly. */
1967 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1968 {
1969 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1970 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1971 i, pure_slp);
1972 }
1973 }
1974
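/* Note that pattern stmts are not inserted into the instruction stream,
   so the operands they use do not appear in the operands' immediate-use
   lists; this is why the explicit walk above is required. A hypothetical
   example: if a multiply was replaced by a widening-multiply pattern stmt
   and one of its operands is defined by a pure-SLP stmt while the pattern
   stmt itself is only loop-vectorized, the walk marks that def as
   hybrid. */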
1975
1976 /* Create and initialize a new bb_vec_info struct for BB, as well as
1977 stmt_vec_info structs for all the stmts in it. */
1978
1979 static bb_vec_info
1980 new_bb_vec_info (basic_block bb)
1981 {
1982 bb_vec_info res = NULL;
1983 gimple_stmt_iterator gsi;
1984
1985 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
1986 BB_VINFO_BB (res) = bb;
1987
1988 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1989 {
1990 gimple stmt = gsi_stmt (gsi);
1991 gimple_set_uid (stmt, 0);
1992 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
1993 }
1994
1995 BB_VINFO_GROUPED_STORES (res).create (10);
1996 BB_VINFO_SLP_INSTANCES (res).create (2);
1997 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
1998
1999 bb->aux = res;
2000 return res;
2001 }
2002
2003
2004 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2005 stmts in the basic block. */
2006
2007 static void
2008 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2009 {
2010 vec<slp_instance> slp_instances;
2011 slp_instance instance;
2012 basic_block bb;
2013 gimple_stmt_iterator si;
2014 unsigned i;
2015
2016 if (!bb_vinfo)
2017 return;
2018
2019 bb = BB_VINFO_BB (bb_vinfo);
2020
2021 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2022 {
2023 gimple stmt = gsi_stmt (si);
2024 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2025
2026 if (stmt_info)
2027 /* Free stmt_vec_info. */
2028 free_stmt_vec_info (stmt);
2029 }
2030
2031 vect_destroy_datarefs (NULL, bb_vinfo);
2032 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2033 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2034 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2035 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2036 vect_free_slp_instance (instance);
2037 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2038 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2039 free (bb_vinfo);
2040 bb->aux = NULL;
2041 }
2042
2043
2044 /* Analyze the statements contained in the SLP tree NODE after recursively
2045 analyzing its subtree. Return TRUE if the operations are supported. */
2046
2047 static bool
2048 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2049 {
2050 bool dummy;
2051 int i;
2052 gimple stmt;
2053 slp_tree child;
2054
2055 if (!node)
2056 return true;
2057
2058 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2059 if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2060 return false;
2061
2062 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2063 {
2064 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2065 gcc_assert (stmt_info);
2066 gcc_assert (PURE_SLP_STMT (stmt_info));
2067
2068 if (!vect_analyze_stmt (stmt, &dummy, node))
2069 return false;
2070 }
2071
2072 return true;
2073 }
2074
2075
2076 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2077 operations are supported. */
2078
2079 static bool
2080 vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2081 {
2082 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2083 slp_instance instance;
2084 int i;
2085
2086 for (i = 0; slp_instances.iterate (i, &instance); )
2087 {
2088 if (!vect_slp_analyze_node_operations (bb_vinfo,
2089 SLP_INSTANCE_TREE (instance)))
2090 {
2091 vect_free_slp_instance (instance);
2092 slp_instances.ordered_remove (i);
2093 }
2094 else
2095 i++;
2096 }
2097
2098 if (!slp_instances.length ())
2099 return false;
2100
2101 return true;
2102 }
2103
2104
2105 /* Compute the scalar cost of the SLP node NODE and its children
2106 and return it. Do not account for defs that are marked in LIFE;
2107 update LIFE according to the uses of NODE. */
2108
2109 static unsigned
2110 vect_bb_slp_scalar_cost (basic_block bb,
2111 slp_tree node, vec<bool, va_heap> *life)
2112 {
2113 unsigned scalar_cost = 0;
2114 unsigned i;
2115 gimple stmt;
2116 slp_tree child;
2117
2118 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2119 {
2120 unsigned stmt_cost;
2121 ssa_op_iter op_iter;
2122 def_operand_p def_p;
2123 stmt_vec_info stmt_info;
2124
2125 if ((*life)[i])
2126 continue;
2127
2128 /* If there is a non-vectorized use of the defs then the scalar
2129 stmt is kept live, in which case we do not account it or any
2130 required defs in the SLP children in the scalar cost. This
2131 way vectorization becomes more costly when compared to
2132 the scalar cost. */
2133 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2134 {
2135 imm_use_iterator use_iter;
2136 gimple use_stmt;
2137 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2138 if (!is_gimple_debug (use_stmt)
2139 && (gimple_code (use_stmt) == GIMPLE_PHI
2140 || gimple_bb (use_stmt) != bb
2141 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2142 {
2143 (*life)[i] = true;
2144 BREAK_FROM_IMM_USE_STMT (use_iter);
2145 }
2146 }
2147 if ((*life)[i])
2148 continue;
2149
2150 stmt_info = vinfo_for_stmt (stmt);
2151 if (STMT_VINFO_DATA_REF (stmt_info))
2152 {
2153 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2154 stmt_cost = vect_get_stmt_cost (scalar_load);
2155 else
2156 stmt_cost = vect_get_stmt_cost (scalar_store);
2157 }
2158 else
2159 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2160
2161 scalar_cost += stmt_cost;
2162 }
2163
2164 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2165 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2166
2167 return scalar_cost;
2168 }
2169
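/* A sketch of this computation, assuming a hypothetical target where each
   scalar stmt, load and store costs 1 (the real values come from
   vect_get_stmt_cost): a node of two stores fed by a node of two adds,
   which is in turn fed by two load nodes of two loads each, has a scalar
   cost of 2 + 2 + 2 + 2 = 8. If one of the adds also had a use outside
   the block, its lane would be marked in LIFE and neither that add nor
   the loads in the same lane would be accounted, lowering the scalar
   cost and making vectorization relatively less attractive. */
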
2170 /* Check if vectorization of the basic block is profitable. */
2171
2172 static bool
2173 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2174 {
2175 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2176 slp_instance instance;
2177 int i, j;
2178 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2179 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2180 void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2181 stmt_vec_info stmt_info = NULL;
2182 stmt_vector_for_cost body_cost_vec;
2183 stmt_info_for_cost *ci;
2184
2185 /* Calculate vector costs. */
2186 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2187 {
2188 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2189
2190 FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2191 {
2192 stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2193 (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2194 stmt_info, ci->misalign, vect_body);
2195 }
2196 }
2197
2198 /* Calculate scalar cost. */
2199 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2200 {
2201 auto_vec<bool, 20> life;
2202 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2203 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2204 SLP_INSTANCE_TREE (instance),
2205 &life);
2206 }
2207
2208 /* Complete the target-specific cost calculation. */
2209 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2210 &vec_inside_cost, &vec_epilogue_cost);
2211
2212 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2213
2214 if (dump_enabled_p ())
2215 {
2216 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2217 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2218 vec_inside_cost);
2219 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2220 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2221 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2222 }
2223
2224 /* Vectorization is profitable if its cost is less than the cost of the
2225 scalar version. */
2226 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2227 return false;
2228
2229 return true;
2230 }
2231
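/* For example, with hypothetical costs vec_inside_cost = 4,
   vec_prologue_cost = 1, vec_epilogue_cost = 0 and scalar_cost = 8,
   we get 1 + 0 + 4 = 5 < 8 and the block is considered profitable to
   vectorize; with scalar_cost = 5 it would be rejected, since the
   comparison above treats equal costs as not profitable. */
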
2232 /* Check if the basic block can be vectorized. */
2233
2234 static bb_vec_info
2235 vect_slp_analyze_bb_1 (basic_block bb)
2236 {
2237 bb_vec_info bb_vinfo;
2238 vec<slp_instance> slp_instances;
2239 slp_instance instance;
2240 int i;
2241 int min_vf = 2;
2242 unsigned n_stmts = 0;
2243
2244 bb_vinfo = new_bb_vec_info (bb);
2245 if (!bb_vinfo)
2246 return NULL;
2247
2248 if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2249 {
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "not vectorized: unhandled data-ref in basic "
2253 "block.\n");
2254
2255 destroy_bb_vec_info (bb_vinfo);
2256 return NULL;
2257 }
2258
2259 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2260 {
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "not vectorized: not enough data-refs in "
2264 "basic block.\n");
2265
2266 destroy_bb_vec_info (bb_vinfo);
2267 return NULL;
2268 }
2269
2270 if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2271 {
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274 "not vectorized: unhandled data access in "
2275 "basic block.\n");
2276
2277 destroy_bb_vec_info (bb_vinfo);
2278 return NULL;
2279 }
2280
2281 vect_pattern_recog (NULL, bb_vinfo);
2282
2283 if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "not vectorized: bad data alignment in basic "
2288 "block.\n");
2289
2290 destroy_bb_vec_info (bb_vinfo);
2291 return NULL;
2292 }
2293
2294 /* Check the SLP opportunities in the basic block, analyze and build SLP
2295 trees. */
2296 if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2297 {
2298 if (dump_enabled_p ())
2299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2300 "not vectorized: failed to find SLP opportunities "
2301 "in basic block.\n");
2302
2303 destroy_bb_vec_info (bb_vinfo);
2304 return NULL;
2305 }
2306
2307 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2308
2309 /* Mark all the statements that we want to vectorize as pure SLP and
2310 relevant. */
2311 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2312 {
2313 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2314 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2315 }
2316
2317 /* Mark all the statements that we do not want to vectorize. */
2318 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2319 !gsi_end_p (gsi); gsi_next (&gsi))
2320 {
2321 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2322 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2323 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2324 }
2325
2326 /* Analyze dependences. At this point all stmts not participating in
2327 vectorization have to be marked. Dependence analysis assumes
2328 that we either vectorize all SLP instances or none at all. */
2329 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2330 {
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "not vectorized: unhandled data dependence "
2334 "in basic block.\n");
2335
2336 destroy_bb_vec_info (bb_vinfo);
2337 return NULL;
2338 }
2339
2340 if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2341 {
2342 if (dump_enabled_p ())
2343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344 "not vectorized: unsupported alignment in basic "
2345 "block.\n");
2346 destroy_bb_vec_info (bb_vinfo);
2347 return NULL;
2348 }
2349
2350 if (!vect_slp_analyze_operations (bb_vinfo))
2351 {
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "not vectorized: bad operation in basic block.\n");
2355
2356 destroy_bb_vec_info (bb_vinfo);
2357 return NULL;
2358 }
2359
2360 /* Compute the costs of the SLP instances. */
2361 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2362 {
2363 gimple stmt = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2364 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2365 vect_analyze_slp_cost (NULL, bb_vinfo,
2366 instance, TYPE_VECTOR_SUBPARTS (vectype));
2367 }
2368
2369 /* Cost model: check if the vectorization is worthwhile. */
2370 if (!unlimited_cost_model (NULL)
2371 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2372 {
2373 if (dump_enabled_p ())
2374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2375 "not vectorized: vectorization is not "
2376 "profitable.\n");
2377
2378 destroy_bb_vec_info (bb_vinfo);
2379 return NULL;
2380 }
2381
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_NOTE, vect_location,
2384 "Basic block will be vectorized using SLP\n");
2385
2386 return bb_vinfo;
2387 }
2388
2389
2390 bb_vec_info
2391 vect_slp_analyze_bb (basic_block bb)
2392 {
2393 bb_vec_info bb_vinfo;
2394 int insns = 0;
2395 gimple_stmt_iterator gsi;
2396 unsigned int vector_sizes;
2397
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2400
2401 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2402 {
2403 gimple stmt = gsi_stmt (gsi);
2404 if (!is_gimple_debug (stmt)
2405 && !gimple_nop_p (stmt)
2406 && gimple_code (stmt) != GIMPLE_LABEL)
2407 insns++;
2408 }
2409
2410 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2411 {
2412 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2414 "not vectorized: too many instructions in "
2415 "basic block.\n");
2416
2417 return NULL;
2418 }
2419
2420 /* Autodetect the first vector size we try. */
2421 current_vector_size = 0;
2422 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2423
2424 while (1)
2425 {
2426 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2427 if (bb_vinfo)
2428 return bb_vinfo;
2429
2430 destroy_bb_vec_info (bb_vinfo);
2431
2432 vector_sizes &= ~current_vector_size;
2433 if (vector_sizes == 0
2434 || current_vector_size == 0)
2435 return NULL;
2436
2437 /* Try the largest remaining vector size. */
2438 current_vector_size = 1 << floor_log2 (vector_sizes);
2439 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "***** Re-trying analysis with "
2442 "vector size %d\n", current_vector_size);
2443 }
2444 }
2445
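/* A sketch of the retry loop above, assuming a hypothetical target whose
   autovectorize_vector_sizes hook returns 16 | 8 (16- and 8-byte
   vectors): the first analysis runs with current_vector_size == 0
   (autodetect). If that fails and autodetection picked 16, the mask
   becomes 8, the next attempt uses 1 << floor_log2 (8) == 8, and after
   a second failure the mask is empty and NULL is returned. */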
2446
2447 /* SLP costs are calculated according to the SLP instance unrolling factor
2448 (i.e., the number of created vector stmts depends on the unrolling factor).
2449 However, the actual number of vector stmts for every SLP node depends on
2450 VF, which is set later in vect_analyze_operations (). Hence, SLP costs
2451 have to be updated. In this function we assume that the inside costs
2452 calculated in vect_model_xxx_cost are linear in ncopies. */
2453
2454 void
2455 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2456 {
2457 unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2458 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2459 slp_instance instance;
2460 stmt_vector_for_cost body_cost_vec;
2461 stmt_info_for_cost *si;
2462 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2463
2464 if (dump_enabled_p ())
2465 dump_printf_loc (MSG_NOTE, vect_location,
2466 "=== vect_update_slp_costs_according_to_vf ===\n");
2467
2468 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2469 {
2470 /* We assume that costs are linear in ncopies. */
2471 int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2472
2473 /* Record the instance's instructions in the target cost model.
2474 This was delayed until here because the count of instructions
2475 isn't known beforehand. */
2476 body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2477
2478 FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2479 (void) add_stmt_cost (data, si->count * ncopies, si->kind,
2480 vinfo_for_stmt (si->stmt), si->misalign,
2481 vect_body);
2482 }
2483 }
2484
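/* A worked example with hypothetical numbers: if the final VF is 8 and an
   instance's SLP unrolling factor is 2, then ncopies == 4, so a body cost
   entry recorded with count 3 contributes 3 * 4 == 12 stmts to the target
   cost model. */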
2485
2486 /* For constant and loop invariant defs of SLP_NODE this function returns
2487 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2488 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2489 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2490 REDUC_INDEX is the index of the reduction operand in the statements, or -1
2491 if there is no reduction operand. */
2492
2493 static void
2494 vect_get_constant_vectors (tree op, slp_tree slp_node,
2495 vec<tree> *vec_oprnds,
2496 unsigned int op_num, unsigned int number_of_vectors,
2497 int reduc_index)
2498 {
2499 vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2500 gimple stmt = stmts[0];
2501 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2502 unsigned nunits;
2503 tree vec_cst;
2504 tree *elts;
2505 unsigned j, number_of_places_left_in_vector;
2506 tree vector_type;
2507 tree vop;
2508 int group_size = stmts.length ();
2509 unsigned int vec_num, i;
2510 unsigned number_of_copies = 1;
2511 vec<tree> voprnds;
2512 voprnds.create (number_of_vectors);
2513 bool constant_p, is_store;
2514 tree neutral_op = NULL;
2515 enum tree_code code = gimple_expr_code (stmt);
2516 gimple def_stmt;
2517 struct loop *loop;
2518 gimple_seq ctor_seq = NULL;
2519
2520 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2521 && reduc_index != -1)
2522 {
2523 op_num = reduc_index - 1;
2524 op = gimple_op (stmt, reduc_index);
2525 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2526 we need either neutral operands or the original operands. See
2527 get_initial_def_for_reduction() for details. */
2528 switch (code)
2529 {
2530 case WIDEN_SUM_EXPR:
2531 case DOT_PROD_EXPR:
2532 case PLUS_EXPR:
2533 case MINUS_EXPR:
2534 case BIT_IOR_EXPR:
2535 case BIT_XOR_EXPR:
2536 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2537 neutral_op = build_real (TREE_TYPE (op), dconst0);
2538 else
2539 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2540
2541 break;
2542
2543 case MULT_EXPR:
2544 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2545 neutral_op = build_real (TREE_TYPE (op), dconst1);
2546 else
2547 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2548
2549 break;
2550
2551 case BIT_AND_EXPR:
2552 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2553 break;
2554
2555 /* For MIN/MAX we don't have an easy neutral operand but
2556 the initial values can be used fine here. Only for
2557 a reduction chain do we have to force a neutral element. */
2558 case MAX_EXPR:
2559 case MIN_EXPR:
2560 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2561 neutral_op = NULL;
2562 else
2563 {
2564 def_stmt = SSA_NAME_DEF_STMT (op);
2565 loop = (gimple_bb (stmt))->loop_father;
2566 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2567 loop_preheader_edge (loop));
2568 }
2569 break;
2570
2571 default:
2572 neutral_op = NULL;
2573 }
2574 }
2575
2576 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2577 {
2578 is_store = true;
2579 op = gimple_assign_rhs1 (stmt);
2580 }
2581 else
2582 is_store = false;
2583
2584 gcc_assert (op);
2585
2586 if (CONSTANT_CLASS_P (op))
2587 constant_p = true;
2588 else
2589 constant_p = false;
2590
2591 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2592 gcc_assert (vector_type);
2593 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2594
2595 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2596 created vectors. It is greater than 1 if unrolling is performed.
2597
2598 For example, we have two scalar operands, s1 and s2 (e.g., group of
2599 strided accesses of size two), while NUNITS is four (i.e., four scalars
2600 of this type can be packed in a vector). The output vector will contain
2601 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2602 will be 2).
2603
2604 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2605 containing the operands.
2606
2607 For example, NUNITS is four as before, and the group size is 8
2608 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2609 {s5, s6, s7, s8}. */
2610
2611 number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2612
2613 number_of_places_left_in_vector = nunits;
2614 elts = XALLOCAVEC (tree, nunits);
2615 for (j = 0; j < number_of_copies; j++)
2616 {
2617 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2618 {
2619 if (is_store)
2620 op = gimple_assign_rhs1 (stmt);
2621 else
2622 {
2623 switch (code)
2624 {
2625 case COND_EXPR:
2626 if (op_num == 0 || op_num == 1)
2627 {
2628 tree cond = gimple_assign_rhs1 (stmt);
2629 op = TREE_OPERAND (cond, op_num);
2630 }
2631 else
2632 {
2633 if (op_num == 2)
2634 op = gimple_assign_rhs2 (stmt);
2635 else
2636 op = gimple_assign_rhs3 (stmt);
2637 }
2638 break;
2639
2640 case CALL_EXPR:
2641 op = gimple_call_arg (stmt, op_num);
2642 break;
2643
2644 case LSHIFT_EXPR:
2645 case RSHIFT_EXPR:
2646 case LROTATE_EXPR:
2647 case RROTATE_EXPR:
2648 op = gimple_op (stmt, op_num + 1);
2649 /* Unlike the other binary operators, shifts/rotates take
2650 an int shift count instead of a count of the same type as
2651 the lhs, so make sure the scalar is the right type when
2652 we are dealing with vectors of
2653 long long/long/short/char. */
2654 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2655 op = fold_convert (TREE_TYPE (vector_type), op);
2656 break;
2657
2658 default:
2659 op = gimple_op (stmt, op_num + 1);
2660 break;
2661 }
2662 }
2663
2664 if (reduc_index != -1)
2665 {
2666 loop = (gimple_bb (stmt))->loop_father;
2667 def_stmt = SSA_NAME_DEF_STMT (op);
2668
2669 gcc_assert (loop);
2670
2671 /* Get the def before the loop. In a reduction chain we have only
2672 one initial value. */
2673 if ((j != (number_of_copies - 1)
2674 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2675 && i != 0))
2676 && neutral_op)
2677 op = neutral_op;
2678 else
2679 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2680 loop_preheader_edge (loop));
2681 }
2682
2683 /* Create 'vect_ = {op0,op1,...,opn}'. */
2684 number_of_places_left_in_vector--;
2685 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2686 {
2687 if (CONSTANT_CLASS_P (op))
2688 {
2689 op = fold_unary (VIEW_CONVERT_EXPR,
2690 TREE_TYPE (vector_type), op);
2691 gcc_assert (op && CONSTANT_CLASS_P (op));
2692 }
2693 else
2694 {
2695 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2696 gimple init_stmt;
2697 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2698 init_stmt
2699 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2700 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2701 op = new_temp;
2702 }
2703 }
2704 elts[number_of_places_left_in_vector] = op;
2705 if (!CONSTANT_CLASS_P (op))
2706 constant_p = false;
2707
2708 if (number_of_places_left_in_vector == 0)
2709 {
2710 number_of_places_left_in_vector = nunits;
2711
2712 if (constant_p)
2713 vec_cst = build_vector (vector_type, elts);
2714 else
2715 {
2716 vec<constructor_elt, va_gc> *v;
2717 unsigned k;
2718 vec_alloc (v, nunits);
2719 for (k = 0; k < nunits; ++k)
2720 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2721 vec_cst = build_constructor (vector_type, v);
2722 }
2723 voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2724 vector_type, NULL));
2725 if (ctor_seq != NULL)
2726 {
2727 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2728 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2729 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2730 GSI_SAME_STMT);
2731 ctor_seq = NULL;
2732 }
2733 }
2734 }
2735 }
2736
2737 /* Since the vectors are created in reverse order, we have to reverse
2738 them here. */
2739 vec_num = voprnds.length ();
2740 for (j = vec_num; j != 0; j--)
2741 {
2742 vop = voprnds[j - 1];
2743 vec_oprnds->quick_push (vop);
2744 }
2745
2746 voprnds.release ();
2747
2748 /* If VF is greater than the unrolling factor needed for the SLP
2749 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
2750 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2751 to replicate the vectors. */
2752 while (number_of_vectors > vec_oprnds->length ())
2753 {
2754 tree neutral_vec = NULL;
2755
2756 if (neutral_op)
2757 {
2758 if (!neutral_vec)
2759 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2760
2761 vec_oprnds->quick_push (neutral_vec);
2762 }
2763 else
2764 {
2765 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2766 vec_oprnds->quick_push (vop);
2767 }
2768 }
2769 }
2770
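/* To illustrate the reduction handling above with a hypothetical
   PLUS_EXPR reduction chain: the neutral element is 0, so every lane of
   every created vector is filled with 0 except one lane of the last
   copy, which receives the single initial value from the loop-preheader
   PHI edge. For a group of independent reductions, each lane of the
   last copy receives its own initial value instead. */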
2771
2772 /* Get the vectorized definitions from SLP_NODE, which contains the
2773 corresponding vectorized def-stmts. */
2774
2775 static void
2776 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2777 {
2778 tree vec_oprnd;
2779 gimple vec_def_stmt;
2780 unsigned int i;
2781
2782 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2783
2784 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2785 {
2786 gcc_assert (vec_def_stmt);
2787 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2788 vec_oprnds->quick_push (vec_oprnd);
2789 }
2790 }
2791
2792
2793 /* Get vectorized definitions for SLP_NODE.
2794 If the scalar definitions are loop invariants or constants, collect them and
2795 call vect_get_constant_vectors() to create vector stmts.
2796 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
2797 must be stored in the corresponding child of SLP_NODE, and we call
2798 vect_get_slp_vect_defs () to retrieve them. */
2799
2800 void
2801 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2802 vec<vec<tree> > *vec_oprnds, int reduc_index)
2803 {
2804 gimple first_stmt;
2805 int number_of_vects = 0, i;
2806 unsigned int child_index = 0;
2807 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2808 slp_tree child = NULL;
2809 vec<tree> vec_defs;
2810 tree oprnd;
2811 bool vectorized_defs;
2812
2813 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2814 FOR_EACH_VEC_ELT (ops, i, oprnd)
2815 {
2816 /* For each operand we check if it has vectorized definitions in a child
2817 node or whether we need to create them (for invariants and constants).
2818 We check if the LHS of the first stmt of the next child matches OPRND.
2819 If it does, we found the correct child. Otherwise, we call
2820 vect_get_constant_vectors () and do not advance CHILD_INDEX, in order
2821 to check this same child node for the next operand. */
2822 vectorized_defs = false;
2823 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2824 {
2825 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2826
2827 /* We have to check both pattern and original def, if available. */
2828 gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2829 gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2830
2831 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2832 || (related
2833 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2834 {
2835 /* The number of vector defs is determined by the number of
2836 vector statements in the node from which we get those
2837 statements. */
2838 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2839 vectorized_defs = true;
2840 child_index++;
2841 }
2842 }
2843
2844 if (!vectorized_defs)
2845 {
2846 if (i == 0)
2847 {
2848 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2849 /* Number of vector stmts was calculated according to LHS in
2850 vect_schedule_slp_instance (), fix it by replacing LHS with
2851 RHS, if necessary. See vect_get_smallest_scalar_type () for
2852 details. */
2853 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2854 &rhs_size_unit);
2855 if (rhs_size_unit != lhs_size_unit)
2856 {
2857 number_of_vects *= rhs_size_unit;
2858 number_of_vects /= lhs_size_unit;
2859 }
2860 }
2861 }
2862
2863 /* Allocate memory for vectorized defs. */
2864 vec_defs = vNULL;
2865 vec_defs.create (number_of_vects);
2866
2867 /* For reduction defs we call vect_get_constant_vectors (), since we are
2868 looking for initial loop invariant values. */
2869 if (vectorized_defs && reduc_index == -1)
2870 /* The defs are already vectorized. */
2871 vect_get_slp_vect_defs (child, &vec_defs);
2872 else
2873 /* Build vectors from scalar defs. */
2874 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2875 number_of_vects, reduc_index);
2876
2877 vec_oprnds->quick_push (vec_defs);
2878
2879 /* For reductions, we only need initial values. */
2880 if (reduc_index != -1)
2881 return;
2882 }
2883 }
2884
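/* For example, for the hypothetical group
   a[0] = b[0] + x; a[1] = b[1] + x; the operand fed by the loads of b[]
   has its vectorized defs in a child node and is fetched with
   vect_get_slp_vect_defs (), while the invariant x matches no child and
   its vector defs are built by vect_get_constant_vectors (). */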
2885
2886 /* Create NCOPIES permutation statements using the vector mask MASK and two
2887 input vectors placed in DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX
2888 for the first copy, shifting both indexes by STRIDE elements of DR_CHAIN
2889 for every following copy.
2890 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2891 copies.)
2892 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE at
2893 which the created stmts must be inserted. */
2894
2895 static inline void
2896 vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2897 tree mask, int first_vec_indx, int second_vec_indx,
2898 gimple_stmt_iterator *gsi, slp_tree node,
2899 tree vectype, vec<tree> dr_chain,
2900 int ncopies, int vect_stmts_counter)
2901 {
2902 tree perm_dest;
2903 gimple perm_stmt = NULL;
2904 stmt_vec_info next_stmt_info;
2905 int i, stride;
2906 tree first_vec, second_vec, data_ref;
2907
2908 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2909
2910 /* Initialize the vect stmts of NODE to properly insert the generated
2911 stmts later. */
2912 for (i = SLP_TREE_VEC_STMTS (node).length ();
2913 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2914 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2915
2916 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2917 for (i = 0; i < ncopies; i++)
2918 {
2919 first_vec = dr_chain[first_vec_indx];
2920 second_vec = dr_chain[second_vec_indx];
2921
2922 /* Generate the permute statement. */
2923 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
2924 first_vec, second_vec, mask);
2925 data_ref = make_ssa_name (perm_dest, perm_stmt);
2926 gimple_set_lhs (perm_stmt, data_ref);
2927 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2928
2929 /* Store the vector statement in NODE. */
2930 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2931
2932 first_vec_indx += stride;
2933 second_vec_indx += stride;
2934 }
2935
2936 /* Mark the scalar stmt as vectorized. */
2937 next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2938 STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2939 }
2940
2941
2942 /* Given FIRST_MASK_ELEMENT - the mask element in element representation -
2943 return in CURRENT_MASK_ELEMENT its equivalent in the target specific
2944 representation. Check that the mask is valid and return FALSE if not.
2945 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
2946 the next vector, i.e., the current first vector is not needed anymore. */
2947
2948 static bool
2949 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2950 int mask_nunits, bool only_one_vec, int index,
2951 unsigned char *mask, int *current_mask_element,
2952 bool *need_next_vector, int *number_of_mask_fixes,
2953 bool *mask_fixed, bool *needs_first_vector)
2954 {
2955 int i;
2956
2957 /* Convert to target specific representation. */
2958 *current_mask_element = first_mask_element + m;
2959 /* Adjust the value in case it's a mask for the second and third vectors. */
2960 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2961
2962 if (*current_mask_element < mask_nunits)
2963 *needs_first_vector = true;
2964
2965 /* We have only one input vector to permute but the mask accesses values in
2966 the next vector as well. */
2967 if (only_one_vec && *current_mask_element >= mask_nunits)
2968 {
2969 if (dump_enabled_p ())
2970 {
2971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972 "permutation requires at least two vectors ");
2973 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2974 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2975 }
2976
2977 return false;
2978 }
2979
2980 /* The mask requires the next vector. */
2981 while (*current_mask_element >= mask_nunits * 2)
2982 {
2983 if (*needs_first_vector || *mask_fixed)
2984 {
2985 /* We either need the first vector too or have already moved to the
2986 next vector. In both cases, this permutation needs three
2987 vectors. */
2988 if (dump_enabled_p ())
2989 {
2990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2991 "permutation requires at "
2992 "least three vectors ");
2993 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2994 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2995 }
2996
2997 return false;
2998 }
2999
3000 /* We move to the next vector, dropping the first one and working with
3001 the second and the third - we need to adjust the values of the mask
3002 accordingly. */
3003 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3004
3005 for (i = 0; i < index; i++)
3006 mask[i] -= mask_nunits * *number_of_mask_fixes;
3007
3008 (*number_of_mask_fixes)++;
3009 *mask_fixed = true;
3010 }
3011
3012 *need_next_vector = *mask_fixed;
3013
3014 /* This was the last element of this mask. Start a new one. */
3015 if (index == mask_nunits - 1)
3016 {
3017 *number_of_mask_fixes = 1;
3018 *mask_fixed = false;
3019 *needs_first_vector = false;
3020 }
3021
3022 return true;
3023 }
3024
3025
3026 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3027 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3028 permute statements for the SLP node NODE of the SLP instance
3029 SLP_NODE_INSTANCE. */
3030
3031 bool
3032 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3033 gimple_stmt_iterator *gsi, int vf,
3034 slp_instance slp_node_instance, bool analyze_only)
3035 {
3036 gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3037 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3038 tree mask_element_type = NULL_TREE, mask_type;
3039 int i, j, k, nunits, vec_index = 0, scalar_index;
3040 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3041 gimple next_scalar_stmt;
3042 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3043 int first_mask_element;
3044 int index, unroll_factor, current_mask_element, ncopies;
3045 unsigned char *mask;
3046 bool only_one_vec = false, need_next_vector = false;
3047 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3048 int number_of_mask_fixes = 1;
3049 bool mask_fixed = false;
3050 bool needs_first_vector = false;
3051 machine_mode mode;
3052
3053 mode = TYPE_MODE (vectype);
3054
3055 if (!can_vec_perm_p (mode, false, NULL))
3056 {
3057 if (dump_enabled_p ())
3058 {
3059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3060 "no vect permute for ");
3061 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3062 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3063 }
3064 return false;
3065 }
3066
3067 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3068 same size as the vector element being permuted. */
3069 mask_element_type = lang_hooks.types.type_for_mode
3070 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3071 mask_type = get_vectype_for_scalar_type (mask_element_type);
3072 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3073 mask = XALLOCAVEC (unsigned char, nunits);
3074 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3075
3076 /* The number of vector stmts to generate based only on the unrolling
3077 factor of SLP_NODE_INSTANCE. */
3078 orig_vec_stmts_num = group_size *
3079 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3080 if (orig_vec_stmts_num == 1)
3081 only_one_vec = true;
3082
3083 /* The number of copies is determined by the final vectorization factor
3084 relative to the SLP_NODE_INSTANCE unrolling factor. */
3085 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3086
3087 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3088 return false;
3089
3090 /* Generate permutation masks for every NODE. The number of masks for each
3091 NODE is equal to GROUP_SIZE.
3092 E.g., we have a group of three nodes with three loads from the same
3093 location in each node, and the vector size is 4. I.e., we have an
3094 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3095 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3096 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3097 ...
3098
3099 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3100 The last mask is illegal since we assume two operands for the permute
3101 operation, and the mask element values can't be outside that range.
3102 Hence, the last mask must be converted into {2,5,5,5}.
3103 For the first two permutations we need the first and the second input
3104 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3105 we need the second and the third vectors: {b1,c1,a2,b2} and
3106 {c2,a3,b3,c3}. */
3107
3108 {
3109 scalar_index = 0;
3110 index = 0;
3111 vect_stmts_counter = 0;
3112 vec_index = 0;
3113 first_vec_index = vec_index++;
3114 if (only_one_vec)
3115 second_vec_index = first_vec_index;
3116 else
3117 second_vec_index = vec_index++;
3118
3119 for (j = 0; j < unroll_factor; j++)
3120 {
3121 for (k = 0; k < group_size; k++)
3122 {
3123 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3124 first_mask_element = i + j * group_size;
3125 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3126 nunits, only_one_vec, index,
3127 mask, &current_mask_element,
3128 &need_next_vector,
3129 &number_of_mask_fixes, &mask_fixed,
3130 &needs_first_vector))
3131 return false;
3132 gcc_assert (current_mask_element < 2 * nunits);
3133 mask[index++] = current_mask_element;
3134
3135 if (index == nunits)
3136 {
3137 index = 0;
3138 if (!can_vec_perm_p (mode, false, mask))
3139 {
3140 if (dump_enabled_p ())
3141 {
3142 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3143 vect_location,
3144 "unsupported vect permute { ");
3145 for (i = 0; i < nunits; ++i)
3146 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3147 mask[i]);
3148 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3149 }
3150 return false;
3151 }
3152
3153 if (!analyze_only)
3154 {
3155 int l;
3156 tree mask_vec, *mask_elts;
3157 mask_elts = XALLOCAVEC (tree, nunits);
3158 for (l = 0; l < nunits; ++l)
3159 mask_elts[l] = build_int_cst (mask_element_type,
3160 mask[l]);
3161 mask_vec = build_vector (mask_type, mask_elts);
3162
3163 if (need_next_vector)
3164 {
3165 first_vec_index = second_vec_index;
3166 second_vec_index = vec_index;
3167 }
3168
3169 next_scalar_stmt
3170 = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3171
3172 vect_create_mask_and_perm (stmt, next_scalar_stmt,
3173 mask_vec, first_vec_index, second_vec_index,
3174 gsi, node, vectype, dr_chain,
3175 ncopies, vect_stmts_counter++);
3176 }
3177 }
3178 }
3179 }
3180 }
3181
3182 return true;
3183 }
3184
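/* A worked example of the mask construction above, with hypothetical
   input: a load group of size 2 with load permutation {1, 0}, nunits == 4
   and an instance unrolling factor of 2 produces the single mask
   {1, 0, 3, 2}, i.e. one VEC_PERM_EXPR swapping adjacent pairs of a
   single input vector (only_one_vec holds, since 2 * 2 / 4 == 1). */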
3185
3186
3187 /* Vectorize an SLP instance tree in postorder. */
3188
3189 static bool
3190 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3191 unsigned int vectorization_factor)
3192 {
3193 gimple stmt;
3194 bool grouped_store, is_store;
3195 gimple_stmt_iterator si;
3196 stmt_vec_info stmt_info;
3197 unsigned int vec_stmts_size, nunits, group_size;
3198 tree vectype;
3199 int i;
3200 slp_tree child;
3201
3202 if (!node)
3203 return false;
3204
3205 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3206 vect_schedule_slp_instance (child, instance, vectorization_factor);
3207
3208 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3209 stmt_info = vinfo_for_stmt (stmt);
3210
3211 /* VECTYPE is the type of the destination. */
3212 vectype = STMT_VINFO_VECTYPE (stmt_info);
3213 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3214 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3215
3216 /* For each SLP instance calculate the number of vector stmts to be created
3217 for the scalar stmts in each node of the SLP tree. The number of vector
3218 elements in one vector iteration is the number of scalar elements in
3219 one scalar iteration (GROUP_SIZE) multiplied by VF divided by the vector
3220 size. */
3221 vec_stmts_size = (vectorization_factor * group_size) / nunits;
3222
3223 if (!SLP_TREE_VEC_STMTS (node).exists ())
3224 {
3225 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3226 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3227 }
3228
3229 if (dump_enabled_p ())
3230 {
3231 dump_printf_loc (MSG_NOTE,vect_location,
3232 "------>vectorizing SLP node starting from: ");
3233 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3234 dump_printf (MSG_NOTE, "\n");
3235 }
3236
3237 /* Vectorized stmts go before the last scalar stmt, which is the point
3238 where all their operands are ready. */
3239 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3240
3241 /* Mark the first element of the reduction chain as a reduction to properly
3242 transform the node. In the analysis phase only the last element of the
3243 chain is marked as a reduction. */
3244 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3245 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3246 {
3247 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3248 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3249 }
3250
3251 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3252 return is_store;
3253 }
3254
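/* For example, with hypothetical numbers VF == 4, GROUP_SIZE == 2 and
   nunits == 4, vec_stmts_size == (4 * 2) / 4 == 2, i.e. two vector stmts
   are created for the scalar stmts of each node of the tree. */
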
3255 /* Replace the scalar calls in SLP node NODE with assignments setting their
3256 lhs to zero. For loop vectorization this is done in vectorizable_call,
3257 but for SLP it needs to be deferred until the end of vect_schedule_slp,
3258 because multiple SLP instances may refer to the same scalar stmt. */
3259
3260 static void
3261 vect_remove_slp_scalar_calls (slp_tree node)
3262 {
3263 gimple stmt, new_stmt;
3264 gimple_stmt_iterator gsi;
3265 int i;
3266 slp_tree child;
3267 tree lhs;
3268 stmt_vec_info stmt_info;
3269
3270 if (!node)
3271 return;
3272
3273 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3274 vect_remove_slp_scalar_calls (child);
3275
3276 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3277 {
3278 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3279 continue;
3280 stmt_info = vinfo_for_stmt (stmt);
3281 if (stmt_info == NULL
3282 || is_pattern_stmt_p (stmt_info)
3283 || !PURE_SLP_STMT (stmt_info))
3284 continue;
3285 lhs = gimple_call_lhs (stmt);
3286 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3287 set_vinfo_for_stmt (new_stmt, stmt_info);
3288 set_vinfo_for_stmt (stmt, NULL);
3289 STMT_VINFO_STMT (stmt_info) = new_stmt;
3290 gsi = gsi_for_stmt (stmt);
3291 gsi_replace (&gsi, new_stmt, false);
3292 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3293 }
3294 }
3295
3296 /* Generate vector code for all SLP instances in the loop/basic block. */
3297
3298 bool
3299 vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3300 {
3301 vec<slp_instance> slp_instances;
3302 slp_instance instance;
3303 unsigned int i, vf;
3304 bool is_store = false;
3305
3306 if (loop_vinfo)
3307 {
3308 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3309 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3310 }
3311 else
3312 {
3313 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3314 vf = 1;
3315 }
3316
3317 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3318 {
3319 /* Schedule the tree of INSTANCE. */
3320 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3321 instance, vf);
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE, vect_location,
3324 "vectorizing stmts using SLP.\n");
3325 }
3326
3327 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3328 {
3329 slp_tree root = SLP_INSTANCE_TREE (instance);
3330 gimple store;
3331 unsigned int j;
3332 gimple_stmt_iterator gsi;
3333
3334 /* Remove scalar call stmts. Do not do this for basic-block
3335 vectorization as not all uses may be vectorized.
3336 ??? Why should this be necessary? DCE should be able to
3337 remove the stmts itself.
3338 ??? For BB vectorization we can as well remove scalar
3339 stmts starting from the SLP tree root if they have no
3340 uses. */
3341 if (loop_vinfo)
3342 vect_remove_slp_scalar_calls (root);
3343
3344 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3345 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3346 {
3347 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3348 break;
3349
3350 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3351 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3352 /* Free the attached stmt_vec_info and remove the stmt. */
3353 gsi = gsi_for_stmt (store);
3354 unlink_stmt_vdef (store);
3355 gsi_remove (&gsi, true);
3356 release_defs (store);
3357 free_stmt_vec_info (store);
3358 }
3359 }
3360
3361 return is_store;
3362 }
3363
3364
3365 /* Vectorize the basic block. */
3366
3367 void
3368 vect_slp_transform_bb (basic_block bb)
3369 {
3370 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3371 gimple_stmt_iterator si;
3372
3373 gcc_assert (bb_vinfo);
3374
3375 if (dump_enabled_p ())
3376 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3377
3378 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3379 {
3380 gimple stmt = gsi_stmt (si);
3381 stmt_vec_info stmt_info;
3382
3383 if (dump_enabled_p ())
3384 {
3385 dump_printf_loc (MSG_NOTE, vect_location,
3386 "------>SLPing statement: ");
3387 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3388 dump_printf (MSG_NOTE, "\n");
3389 }
3390
3391 stmt_info = vinfo_for_stmt (stmt);
3392 gcc_assert (stmt_info);
3393
3394 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3395 if (STMT_SLP_TYPE (stmt_info))
3396 {
3397 vect_schedule_slp (NULL, bb_vinfo);
3398 break;
3399 }
3400 }
3401
3402 if (dump_enabled_p ())
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "BASIC BLOCK VECTORIZED\n");
3405
3406 destroy_bb_vec_info (bb_vinfo);
3407 }