tree-optimization/97466 - remove spurious assert
gcc/tree-vect-slp.c
/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"

static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);

/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  delete node;
}

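/* Illustrative sketch (not part of this file): the reference-counting
   discipline vect_free_slp_tree implements, reduced to a minimal
   stand-alone tree.  SLP children can be shared between parents, so a
   node is only freed when the last reference to it goes away.  The
   demo_* names below are hypothetical.  */

#include <vector>

struct demo_node
{
  unsigned refcnt = 1;               /* A new node starts with one reference,
                                        like SLP_TREE_REF_COUNT above.  */
  std::vector<demo_node *> children; /* Shared children have refcnt > 1.  */
};

/* Mirror of vect_free_slp_tree: drop one reference, and free the subgraph
   only when the count reaches zero.  */
static void
demo_free (demo_node *node)
{
  if (--node->refcnt != 0)
    return;
  for (demo_node *child : node->children)
    demo_free (child);
  delete node;
}
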
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (root_stmt)
    return root_stmt->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
              || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

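/* Illustrative sketch (not part of this file): what
   vect_slp_tree_uniform_p computes, on a plain array of lane values.  A
   uniform group such as { 3, 3, 3, 3 } can be materialized with a single
   splat/broadcast instead of a per-lane vector construction, which is
   why the property is worth detecting.  demo_* names are hypothetical.  */

static bool
demo_all_lanes_equal (const int *ops, unsigned n)
{
  if (n == 0)       /* No scalar ops: a pre-existing vector, not uniform.  */
    return false;
  for (unsigned i = 1; i < n; ++i)
    if (ops[i] != ops[0])   /* operand_equal_p in the real code.  */
      return false;
  return true;
}
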
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

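/* Illustrative sketch (not part of this file): how the chain walk above
   indexes group members.  DR_GROUP_GAP of a non-first member counts how
   many elements it lies after the previous member (1 for consecutive
   accesses), so for a group loading a[0], a[1], a[3] the places are
   0, 1 and 3.  demo_* types are hypothetical stand-ins.  */

struct demo_member
{
  demo_member *next;  /* DR_GROUP_NEXT_ELEMENT  */
  int gap;            /* DR_GROUP_GAP  */
};

/* Mirror of vect_get_place_in_interleaving_chain: the place of TARGET is
   the sum of the gaps seen while walking the chain up to it.  */
static int
demo_place_in_chain (demo_member *first, demo_member *target)
{
  int place = 0;
  for (demo_member *m = first; m; m = m->next)
    {
      if (m == target)
        return place;
      if (m->next)
        place += m->next->gap;
    }
  return -1;
}
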
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode)))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

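/* Illustrative sketch (not part of this file): the two interleaving
   selectors built above, expanded for a concrete (constant) lane count.
   The real code only pushes three seed pattern elements per selector and
   lets vec_perm_builder extrapolate the series; this prints the full
   expansion.  For nelts == 8 the output is
     sel1 = { 0 8 1 9 2 10 3 11 }
     sel2 = { 4 12 5 13 6 14 7 15 }
   i.e. an interleave of the low halves and of the high halves of two
   input vectors.  */

#include <cstdio>

static void
demo_interleave_selectors (unsigned nelts)
{
  unsigned half = nelts / 2;
  printf ("sel1 = {");
  for (unsigned i = 0; i < half; ++i)
    printf (" %u %u", i, i + nelts);
  printf (" }\nsel2 = {");
  for (unsigned i = 0; i < half; ++i)
    printf (" %u %u", half + i, half + i + nelts);
  printf (" }\n");
}
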
/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
          || ((dta == vect_external_def || dta == vect_constant_def)
              && (dtb == vect_external_def || dtb == vect_constant_def)));
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to a non-zero
   value.
   If there was a fatal error, return -1; if the error could be corrected by
   swapping operands of this node's parent, return 1; if everything is ok,
   return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (swap != 0);
  gcc_assert (!swapped || first_op_cond);
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (first)
        {
          tree type = TREE_TYPE (oprnd);
          dt = dts[i];
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      type)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }

          /* For the swapping logic below force vect_reduction_def
             for the reduction op in an SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dts[i] = dt = vect_reduction_def;

          /* Check the types of the definition.  */
          switch (dt)
            {
            case vect_external_def:
            case vect_constant_def:
            case vect_internal_def:
            case vect_reduction_def:
            case vect_induction_def:
              break;

            default:
              /* FORNOW: Not supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: illegal type of def %T\n",
                                 oprnd);
              return -1;
            }

          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = type;
        }
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operand types\n");
          return 1;
        }

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest must belong to
         the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)
                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                && def_stmt_info
                && !STMT_VINFO_DATA_REF (def_stmt_info)
                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
          || (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && ((!def_stmt_info
                   || STMT_VINFO_DATA_REF (def_stmt_info)
                   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                  != (oprnd_info->first_dt != vect_reduction_def))))
        {
          /* Try swapping operands if we got a mismatch.  For BB
             vectorization only in case it will clearly improve things.  */
          if (i == commutative_op && !swapped
              && (!is_a <bb_vec_info> (vinfo)
                  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
                                             dts[i+1])
                      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
                          || vect_def_types_match
                               ((*oprnds_info)[i+1]->first_dt, dts[i])))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "trying swapped operands\n");
              std::swap (dts[i], dts[i+1]);
              std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
                         (*oprnds_info)[i+1]->def_stmts[stmt_num]);
              std::swap ((*oprnds_info)[i]->ops[stmt_num],
                         (*oprnds_info)[i+1]->ops[stmt_num]);
              swapped = true;
              continue;
            }

          if (is_a <bb_vec_info> (vinfo))
            {
              /* Now for commutative ops we should see whether we can
                 make the other operand match.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "treating operand as external\n");
              oprnd_info->first_dt = dt = vect_external_def;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");
              return 1;
            }
        }

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
        oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
         each of the chain members.  That gets us a sane SLP graph (still
         the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
                || dt == vect_reduction_def)
               && oprnd_info->first_dt == vect_reduction_def
               && !STMT_VINFO_DATA_REF (stmt_info)
               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
               && !STMT_VINFO_DATA_REF (def_stmt_info)
               && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
        {
          oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
          oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
        }

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  return 0;
}

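/* Illustrative sketch (not part of this file): a source-level view of the
   operand swapping done above.  In lane 1 the invariant K is on the left,
   so its def types ({ external, internal }) do not match lane 0's
   ({ internal, external }); since PLUS_EXPR is commutative the function
   swaps lane 1's operands instead of failing the SLP build.  */

void
demo_commutative_swap (int *a, const int *b, int k)
{
  a[0] = b[0] + k;   /* lane 0: { load (internal), k (external) }  */
  a[1] = k + b[1];   /* lane 1: swapped to match lane 0  */
}
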
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype)
    return useless_type_conversion_p (vectype, old_vectype);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We maintain the invariant that if any statement in the group is
         used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
        if (is_pattern_stmt_p (member_info)
            && !useless_type_conversion_p (vectype,
                                           STMT_VINFO_VECTYPE (member_info)))
          break;

      if (!member_info)
        {
          for (member_info = first_info; member_info;
               member_info = DR_GROUP_NEXT_ELEMENT (member_info))
            STMT_VINFO_VECTYPE (member_info) = vectype;
          return true;
        }
    }
  else if (!is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: incompatible vector"
                       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
                       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
                       "  new vector type: %T\n", vectype);
    }
  return false;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

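/* Illustrative sketch (not part of this file): a statement group that
   passes compatible_calls_p -- every lane calls the same function with
   compatible argument and return types -- and can therefore be combined
   into a single vector call.  */

#include <cmath>

void
demo_compatible_calls (float *a, const float *b)
{
  a[0] = sqrtf (b[0]);   /* Same callee and types in all four lanes,  */
  a[1] = sqrtf (b[1]);   /* so the group can map to one vector sqrt.  */
  a[2] = sqrtf (b[2]);
  a[3] = sqrtf (b[3]);
}
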
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
                        unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

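/* Illustrative sketch (not part of this file): the basic-block unrolling
   check above on concrete numbers.  A BB group can only use a vector
   type whose lane count fits within the group, because a basic block
   cannot be unrolled to fill the remaining lanes; e.g. a 4-lane vector
   does not fit a 2-statement group.  */

static bool
demo_bb_vectype_fits (unsigned const_nunits, unsigned group_size)
{
  /* Mirrors the "const_nunits > group_size" rejection above, for the
     constant-width case.  */
  return const_nunits <= group_size;
}
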
/* Verify whether the scalar stmts STMTS are isomorphic, do not require
   data permutation, and are not of unsupported operation types.  Return
   true if so; otherwise return false and indicate in *MATCHES which
   stmts are not isomorphic to the first one.  If MATCHES[0] is false
   then this indicates the comparison could not be carried out or the
   stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, statements
         that can throw or statements with volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
          || stmt_can_throw_internal (cfun, stmt)
          || gimple_has_volatile_ops (stmt))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* ??? For BB vectorization we want to commute operands in a way
             to shuffle all unvectorizable defs into one operand and have
             the other still vectorized.  The following doesn't reliably
             work for this though but it's the easiest we can do here.  */
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
                                           &nunits_vectype, group_size)
          || (nunits_vectype
              && !vect_record_max_nunits (vinfo, stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                          (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = gimple_vuse (stmt);
        }

      /* Check the operation.  */
      if (i == 0)
        {
          *node_vectype = vectype;
          first_stmt_code = rhs_code;
          first_stmt_load_p = load_p;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      if (is_a <bb_vec_info> (vinfo) && i != 0)
                        continue;
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      if (is_a <bb_vec_info> (vinfo) && i != 0)
                        continue;
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
          else if (!load_p
                   && rhs_code == BIT_FIELD_REF)
            {
              tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
              if (TREE_CODE (vec) != SSA_NAME
                  || !types_compatible_p (vectype, TREE_TYPE (vec)))
                {
                  if (is_a <bb_vec_info> (vinfo) && i != 0)
                    continue;
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: "
                                     "BIT_FIELD_REF not supported\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }
            }
          else if (call_stmt
                   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if ((first_stmt_code != rhs_code
               && (first_stmt_code != IMAGPART_EXPR
                   || rhs_code != REALPART_EXPR)
               && (first_stmt_code != REALPART_EXPR
                   || rhs_code != IMAGPART_EXPR)
               /* Handle mismatches in plus/minus by computing both
                  and merging the results.  */
               && !((first_stmt_code == PLUS_EXPR
                     || first_stmt_code == MINUS_EXPR)
                    && (alt_stmt_code == PLUS_EXPR
                        || alt_stmt_code == MINUS_EXPR)
                    && rhs_code == alt_stmt_code)
               && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
                        || first_stmt_code == INDIRECT_REF
                        || first_stmt_code == COMPONENT_REF
                        || first_stmt_code == MEM_REF)))
              || first_stmt_load_p != load_p)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds)
            {
              tree other_op1 = (call_stmt
                                ? gimple_call_arg (call_stmt, 1)
                                : gimple_assign_rhs2 (stmt));
              if (!operand_equal_p (first_op1, other_op1, 0))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
          if (!load_p
              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
                  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BIT_FIELD_REF "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (!types_compatible_p (vectype, *node_vectype))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different vector type "
                                 "in %G", stmt);
              /* Mismatch.  */
              continue;
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Not grouped loads are not supported.  */
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR
              && rhs_code != BIT_FIELD_REF)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}

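/* Illustrative sketch (not part of this file): the two-operator case that
   vect_build_slp_tree_1 accepts via ALT_STMT_CODE.  Lanes mixing plus and
   minus are vectorized by computing both a vector add and a vector
   subtract and then blending the lanes with a permute (see the
   VEC_PERM_EXPR node built for TWO_OPERATORS in vect_build_slp_tree_2
   below).  */

void
demo_addsub (float *a, const float *b, const float *c)
{
  a[0] = b[0] - c[0];   /* lanes 0 and 2: MINUS_EXPR  */
  a[1] = b[1] + c[1];   /* lanes 1 and 3: PLUS_EXPR (the alternate code)  */
  a[2] = b[2] - c[2];
  a[3] = b[3] + c[3];
}
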
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove entries except at destruction time, so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

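/* Illustrative sketch (not part of this file): the memoization protocol
   the map above implements.  Keys are the stmt vectors themselves and a
   null value caches a failed build, so neither successes nor failures
   are recomputed.  The demo_* types are hypothetical stand-ins for
   vec<stmt_vec_info> and slp_tree.  */

#include <map>
#include <vector>

struct demo_tree {};
typedef std::vector<int> demo_key;
typedef std::map<demo_key, demo_tree *> demo_map;

/* Stand-in for the real recursive builder (vect_build_slp_tree_2).  */
static demo_tree *
demo_build_uncached (const demo_key &)
{
  return 0;
}

static demo_tree *
demo_build (const demo_key &stmts, demo_map &bst_map)
{
  demo_map::const_iterator it = bst_map.find (stmts);
  if (it != bst_map.end ())
    return it->second;          /* Re-use a cached success or failure.  */
  demo_tree *res = demo_build_uncached (stmts);
  bst_map[stmts] = res;         /* A null result caches the failure too.  */
  return res;
}
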
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          SLP_TREE_REF_COUNT (*leader)++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree for the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   data permutation or are of unsupported types of operation; in that
   case *MATCHES indicates which stmts failed to match the first one.
   Otherwise return the new SLP node.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
    nops = gimple_phi_num_args (phi);
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
                                                  group_size);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
                                   max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
              return NULL;
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, nops);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators,
                              &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts, 0);
          SLP_TREE_VECTYPE (node) = vectype;
          /* And compute the load permutation.  Whether it is actually
             a permutation depends on the unrolling factor which is
             decided later.  */
          vec<unsigned> load_permutation;
          int j;
          stmt_vec_info load_info;
          load_permutation.create (group_size);
          stmt_vec_info first_stmt_info
            = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              int load_place = vect_get_place_in_interleaving_chain
                  (load_info, first_stmt_info);
              gcc_assert (load_place != -1);
              load_permutation.safe_push (load_place);
            }
          SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
          return node;
        }
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
           && !gimple_vuse (stmt_info->stmt)
           && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
         the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
        {
          gassign *estmt = as_a <gassign *> (estmt_info->stmt);
          tree bfref = gimple_assign_rhs1 (estmt);
          HOST_WIDE_INT lane;
          if (!known_eq (bit_field_size (bfref),
                         tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
              || !constant_multiple_p (bit_field_offset (bfref),
                                       bit_field_size (bfref), &lane))
            {
              lperm.release ();
              return NULL;
            }
          lperm.safe_push (std::make_pair (0, (unsigned)lane));
        }
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node, even if it is an
         identity permute, to shield the rest of the vectorizer from the
         odd node representing an actual vector without any scalar ops.
         ??? We could hide it completely by making the permute node
         external?  */
      node = vect_create_new_slp_node (stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }
  swap = NULL;

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* COND_EXPR has one operand too many when the condition
             is an SSA name.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }
          /* We do not undo the swapping here since it might still be
             the better order for the second operand in case we build
             the first one from scalars below.  */
          ++*npermutes;
        }
fail:

      /* If the SLP build failed and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern)
        {
          /* But if there's a leading vector-sized set of matching stmts
             fail here so we can split the group.  This matches the condition
             vect_analyze_slp_instance uses.  */
          /* ??? We might want to split here and combine the results to support
             multiple vector sizes better.  */
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              break;
          if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Building vector operands from scalars\n");
              this_tree_size++;
              child = vect_create_new_slp_node (oprnd_info->ops);
              children.safe_push (child);
              oprnd_info->ops = vNULL;
              oprnd_info->def_stmts = vNULL;
              continue;
            }
        }

      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If all children of this node are built up from uniform scalars, or if
     building them requires more than one possibly expensive vector
     construction, then just throw the node away so that it is built up
     from scalars instead.  The exception is the SLP node for the
     vector store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
         do extra work to cancel the pattern so the uses see the
         scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      bool all_uniform_p = true;
      unsigned n_vector_builds = 0;
      FOR_EACH_VEC_ELT (children, j, child)
        {
          if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
            all_uniform_p = false;
          else if (!vect_slp_tree_uniform_p (child))
            {
              all_uniform_p = false;
              if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
                n_vector_builds++;
            }
        }
      if (all_uniform_p || n_vector_builds > 1)
        {
          /* Roll back.  */
          matches[0] = false;
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child);

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building parent vector operands from "
                             "scalars instead\n");
          return NULL;
        }
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
         { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
         the true { a+b, a+b, a+b, a+b } ... but there we don't have
         explicit stmts to put in so the keying on 'stmts' doesn't
         work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
        SLP_TREE_REF_COUNT (child)++;

      /* Here we record the original defs since this
         node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
        {
          gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
          if (gimple_assign_rhs_code (ostmt) != code0)
            {
              SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
              ocode = gimple_assign_rhs_code (ostmt);
              j = i;
            }
          else
            SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
        }
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

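/* Illustrative sketch (not part of this file): the load permutation
   recorded at the load leaves built above.  SLP lanes may read group
   members out of order; the permutation stores, per lane, the member's
   place in the interleaving chain, here { 1, 0, 3, 2 }.  */

void
demo_load_permutation (int *a, const int *b)
{
  a[0] = b[1];   /* lane 0 reads chain position 1  */
  a[1] = b[0];   /* lane 1 reads chain position 0  */
  a[2] = b[3];   /* lane 2 reads chain position 3  */
  a[3] = b[2];   /* lane 3 reads chain position 2  */
}
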
1718 /* Dump a single SLP tree NODE. */
1719
1720 static void
1721 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1722 slp_tree node)
1723 {
1724 unsigned i, j;
1725 slp_tree child;
1726 stmt_vec_info stmt_info;
1727 tree op;
1728
1729 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1730 dump_user_location_t user_loc = loc.get_user_location ();
1731 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1732 SLP_TREE_DEF_TYPE (node) == vect_external_def
1733 ? " (external)"
1734 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1735 ? " (constant)"
1736 : ""), node,
1737 estimated_poly_value (node->max_nunits),
1738 SLP_TREE_REF_COUNT (node));
1739 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1740 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1741 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1742 else
1743 {
1744 dump_printf_loc (metadata, user_loc, "\t{ ");
1745 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1746 dump_printf (metadata, "%T%s ", op,
1747 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1748 dump_printf (metadata, "}\n");
1749 }
1750 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1751 {
1752 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1753 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1754 dump_printf (dump_kind, " %u", j);
1755 dump_printf (dump_kind, " }\n");
1756 }
1757 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1758 {
1759 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1760 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1761 dump_printf (dump_kind, " %u[%u]",
1762 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1763 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1764 dump_printf (dump_kind, " }\n");
1765 }
1766 if (SLP_TREE_CHILDREN (node).is_empty ())
1767 return;
1768 dump_printf_loc (metadata, user_loc, "\tchildren");
1769 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1770 dump_printf (dump_kind, " %p", (void *)child);
1771 dump_printf (dump_kind, "\n");
1772 }
1773
1774 DEBUG_FUNCTION void
1775 debug (slp_tree node)
1776 {
1777 debug_dump_context ctx;
1778 vect_print_slp_tree (MSG_NOTE,
1779 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1780 node);
1781 }
1782
1783 /* Dump the SLP graph rooted at NODE using the flags specified in DUMP_KIND. */
1784
1785 static void
1786 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1787 slp_tree node, hash_set<slp_tree> &visited)
1788 {
1789 unsigned i;
1790 slp_tree child;
1791
1792 if (visited.add (node))
1793 return;
1794
1795 vect_print_slp_tree (dump_kind, loc, node);
1796
1797 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1798 vect_print_slp_graph (dump_kind, loc, child, visited);
1799 }
1800
1801 static void
1802 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1803 slp_tree entry)
1804 {
1805 hash_set<slp_tree> visited;
1806 vect_print_slp_graph (dump_kind, loc, entry, visited);
1807 }
1808
1809 /* Mark the tree rooted at NODE with PURE_SLP. */
1810
1811 static void
1812 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1813 {
1814 int i;
1815 stmt_vec_info stmt_info;
1816 slp_tree child;
1817
1818 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1819 return;
1820
1821 if (visited.add (node))
1822 return;
1823
1824 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1825 STMT_SLP_TYPE (stmt_info) = pure_slp;
1826
1827 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1828 vect_mark_slp_stmts (child, visited);
1829 }
1830
1831 static void
1832 vect_mark_slp_stmts (slp_tree node)
1833 {
1834 hash_set<slp_tree> visited;
1835 vect_mark_slp_stmts (node, visited);
1836 }
1837
1838 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1839
1840 static void
1841 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1842 {
1843 int i;
1844 stmt_vec_info stmt_info;
1845 slp_tree child;
1846
1847 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1848 return;
1849
1850 if (visited.add (node))
1851 return;
1852
1853 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1854 {
1855 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1856 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1857 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1858 }
1859
1860 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1861 vect_mark_slp_stmts_relevant (child, visited);
1862 }
1863
1864 static void
1865 vect_mark_slp_stmts_relevant (slp_tree node)
1866 {
1867 hash_set<slp_tree> visited;
1868 vect_mark_slp_stmts_relevant (node, visited);
1869 }
1870
1871
1872 /* Gather the loads in the SLP graph rooted at NODE into the LOADS array. */
1873
1874 static void
1875 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1876 hash_set<slp_tree> &visited)
1877 {
1878 if (visited.add (node))
1879 return;
1880
1881 if (SLP_TREE_CHILDREN (node).length () == 0)
1882 {
1883 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1884 return;
1885 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1886 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1887 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1888 loads.safe_push (node);
1889 }
1890 else
1891 {
1892 unsigned i;
1893 slp_tree child;
1894 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1895 vect_gather_slp_loads (loads, child, visited);
1896 }
1897 }
1898
1899 static void
1900 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1901 {
1902 hash_set<slp_tree> visited;
1903 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1904 }
1905
1906
1907 /* Find the last scalar stmt in NODE. */
1908
1909 stmt_vec_info
1910 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1911 {
1912 stmt_vec_info last = NULL;
1913 stmt_vec_info stmt_vinfo;
1914
1915 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1916 {
1917 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1918 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1919 }
1920
1921 return last;
1922 }
1923
1924 /* Find the first scalar stmt in NODE. */
1925
1926 stmt_vec_info
1927 vect_find_first_scalar_stmt_in_slp (slp_tree node)
1928 {
1929 stmt_vec_info first = NULL;
1930 stmt_vec_info stmt_vinfo;
1931
1932 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1933 {
1934 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1935 if (!first
1936 || get_later_stmt (stmt_vinfo, first) == first)
1937 first = stmt_vinfo;
1938 }
1939
1940 return first;
1941 }
1942
1943 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1944 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1945 (also containing the first GROUP1_SIZE stmts, since stores are
1946 consecutive), the second containing the remainder.
1947 Return the first stmt in the second group. */
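/* For example (a hedged sketch): splitting a store group of size 8
   with DR_GROUP_GAP 0 at GROUP1_SIZE 4 leaves the first group with
   DR_GROUP_SIZE 4 and DR_GROUP_GAP 4 (it now has to skip over the
   second group), while the second group gets DR_GROUP_SIZE 4 and
   DR_GROUP_GAP 4 (the gap before the original group plus the four
   stmts of the first group).  */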
1948
1949 static stmt_vec_info
1950 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1951 {
1952 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1953 gcc_assert (group1_size > 0);
1954 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1955 gcc_assert (group2_size > 0);
1956 DR_GROUP_SIZE (first_vinfo) = group1_size;
1957
1958 stmt_vec_info stmt_info = first_vinfo;
1959 for (unsigned i = group1_size; i > 1; i--)
1960 {
1961 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1962 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1963 }
1964 /* STMT_INFO is now the last element of the first group. */
1965 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1966 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1967
1968 DR_GROUP_SIZE (group2) = group2_size;
1969 for (stmt_info = group2; stmt_info;
1970 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1971 {
1972 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1973 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1974 }
1975
1976 /* For the second group, the DR_GROUP_GAP is that before the original group,
1977 plus skipping over the first vector. */
1978 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1979
1980 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1981 DR_GROUP_GAP (first_vinfo) += group2_size;
1982
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1985 group1_size, group2_size);
1986
1987 return group2;
1988 }
1989
1990 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1991 statements and a vector of NUNITS elements. */
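/* For instance (a hedged example): with NUNITS 4 and GROUP_SIZE 6 the
   common multiple is 12, so exact_div yields an unrolling factor of 2,
   i.e. two copies of the group are needed to fill whole vectors; with
   GROUP_SIZE 8 the factor would be 1.  */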
1992
1993 static poly_uint64
1994 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1995 {
1996 return exact_div (common_multiple (nunits, group_size), group_size);
1997 }
1998
1999 /* Analyze an SLP instance starting from a group of grouped stores. Call
2000 vect_build_slp_tree to build a tree of packed stmts if possible.
2001 Return FALSE if it's impossible to SLP any stmt in the loop. */
2002
2003 static bool
2004 vect_analyze_slp_instance (vec_info *vinfo,
2005 scalar_stmts_to_slp_tree_map_t *bst_map,
2006 stmt_vec_info stmt_info, unsigned max_tree_size)
2007 {
2008 slp_instance new_instance;
2009 slp_tree node;
2010 unsigned int group_size;
2011 tree vectype, scalar_type = NULL_TREE;
2012 unsigned int i;
2013 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2014 vec<stmt_vec_info> scalar_stmts;
2015 bool constructor = false;
2016
2017 if (is_a <bb_vec_info> (vinfo))
2018 vect_location = stmt_info->stmt;
2019 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2020 {
2021 scalar_type = TREE_TYPE (DR_REF (dr));
2022 group_size = DR_GROUP_SIZE (stmt_info);
2023 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2024 }
2025 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2026 {
2027 gcc_assert (is_a <loop_vec_info> (vinfo));
2028 vectype = STMT_VINFO_VECTYPE (stmt_info);
2029 group_size = REDUC_GROUP_SIZE (stmt_info);
2030 }
2031 else if (is_gimple_assign (stmt_info->stmt)
2032 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2033 {
2034 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2035 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2036 constructor = true;
2037 }
2038 else
2039 {
2040 gcc_assert (is_a <loop_vec_info> (vinfo));
2041 vectype = STMT_VINFO_VECTYPE (stmt_info);
2042 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2043 }
2044
2045 if (!vectype)
2046 {
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2049 "Build SLP failed: unsupported data-type %T\n",
2050 scalar_type);
2051
2052 return false;
2053 }
2054 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2055
2056 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2057 scalar_stmts.create (group_size);
2058 stmt_vec_info next_info = stmt_info;
2059 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2060 {
2061 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2062 while (next_info)
2063 {
2064 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2065 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2066 }
2067 }
2068 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2069 {
2070 /* Collect the reduction stmts and store them in
2071 SLP_TREE_SCALAR_STMTS. */
2072 while (next_info)
2073 {
2074 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2075 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2076 }
2077 /* Mark the first element of the reduction chain as reduction to properly
2078 transform the node. In the reduction analysis phase only the last
2079 element of the chain is marked as reduction. */
2080 STMT_VINFO_DEF_TYPE (stmt_info)
2081 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2082 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2083 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2084 }
2085 else if (constructor)
2086 {
2087 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2088 tree val;
2089 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2090 {
2091 if (TREE_CODE (val) == SSA_NAME)
2092 {
2093 gimple* def = SSA_NAME_DEF_STMT (val);
2094 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2095 /* Value is defined in another basic block. */
2096 if (!def_info)
2097 return false;
2098 def_info = vect_stmt_to_vectorize (def_info);
2099 scalar_stmts.safe_push (def_info);
2100 }
2101 else
2102 return false;
2103 }
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "Analyzing vectorizable constructor: %G\n",
2107 stmt_info->stmt);
2108 }
2109 else
2110 {
2111 /* Collect reduction statements. */
2112 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2113 for (i = 0; reductions.iterate (i, &next_info); i++)
2114 scalar_stmts.safe_push (next_info);
2115 }
2116
2117 if (dump_enabled_p ())
2118 {
2119 dump_printf_loc (MSG_NOTE, vect_location,
2120 "Starting SLP discovery for\n");
2121 for (i = 0; i < scalar_stmts.length (); ++i)
2122 dump_printf_loc (MSG_NOTE, vect_location,
2123 " %G", scalar_stmts[i]->stmt);
2124 }
2125
2126 /* Build the tree for the SLP instance. */
2127 bool *matches = XALLOCAVEC (bool, group_size);
2128 unsigned npermutes = 0;
2129 poly_uint64 max_nunits = nunits;
2130 unsigned tree_size = 0;
2131 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2132 &max_nunits, matches, &npermutes,
2133 &tree_size, bst_map);
2134 if (node != NULL)
2135 {
2136 /* Calculate the unrolling factor based on the smallest type. */
2137 poly_uint64 unrolling_factor
2138 = calculate_unrolling_factor (max_nunits, group_size);
2139
2140 if (maybe_ne (unrolling_factor, 1U)
2141 && is_a <bb_vec_info> (vinfo))
2142 {
2143 unsigned HOST_WIDE_INT const_max_nunits;
2144 if (!max_nunits.is_constant (&const_max_nunits)
2145 || const_max_nunits > group_size)
2146 {
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 "Build SLP failed: store group "
2150 "size not a multiple of the vector size "
2151 "in basic block SLP\n");
2152 vect_free_slp_tree (node);
2153 return false;
2154 }
2155 /* Fatal mismatch. */
2156 if (dump_enabled_p ())
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "SLP discovery succeeded but node needs "
2159 "splitting\n");
2160 memset (matches, true, group_size);
2161 matches[group_size / const_max_nunits * const_max_nunits] = false;
2162 vect_free_slp_tree (node);
2163 }
2164 else
2165 {
2166 /* Create a new SLP instance. */
2167 new_instance = XNEW (class _slp_instance);
2168 SLP_INSTANCE_TREE (new_instance) = node;
2169 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2170 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2171 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2172 new_instance->reduc_phis = NULL;
2173 new_instance->cost_vec = vNULL;
2174 new_instance->subgraph_entries = vNULL;
2175
2176 vect_gather_slp_loads (new_instance, node);
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "SLP size %u vs. limit %u.\n",
2180 tree_size, max_tree_size);
2181
2182 /* Check whether any load is possibly permuted. */
2183 slp_tree load_node;
2184 bool loads_permuted = false;
2185 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2186 {
2187 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2188 continue;
2189 unsigned j;
2190 stmt_vec_info load_info;
2191 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2192 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2193 {
2194 loads_permuted = true;
2195 break;
2196 }
2197 }
2198
2199 /* If the loads and stores can be handled with load/store-lane
2200 instructions, do not generate this SLP instance. */
2201 if (is_a <loop_vec_info> (vinfo)
2202 && loads_permuted
2203 && dr && vect_store_lanes_supported (vectype, group_size, false))
2204 {
2205 slp_tree load_node;
2206 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2207 {
2208 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2209 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2210 /* Use SLP for strided accesses (or if we can't load-lanes). */
2211 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2212 || ! vect_load_lanes_supported
2213 (STMT_VINFO_VECTYPE (stmt_vinfo),
2214 DR_GROUP_SIZE (stmt_vinfo), false))
2215 break;
2216 }
2217 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2218 {
2219 if (dump_enabled_p ())
2220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2221 "Built SLP cancelled: can use "
2222 "load/store-lanes\n");
2223 vect_free_slp_instance (new_instance);
2224 return false;
2225 }
2226 }
2227
2228 /* If this is a reduction chain with a conversion in front,
2229 amend the SLP tree with a node for that conversion. */
2230 if (!dr
2231 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2232 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2233 {
2234 /* Get at the conversion stmt - we know it's the single use
2235 of the last stmt of the reduction chain. */
2236 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2237 use_operand_p use_p;
2238 gimple *use_stmt;
2239 bool r = single_imm_use (gimple_assign_lhs (tem),
2240 &use_p, &use_stmt);
2241 gcc_assert (r);
2242 next_info = vinfo->lookup_stmt (use_stmt);
2243 next_info = vect_stmt_to_vectorize (next_info);
2244 scalar_stmts = vNULL;
2245 scalar_stmts.create (group_size);
2246 for (unsigned i = 0; i < group_size; ++i)
2247 scalar_stmts.quick_push (next_info);
2248 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2249 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2250 SLP_TREE_CHILDREN (conv).quick_push (node);
2251 SLP_INSTANCE_TREE (new_instance) = conv;
2252 /* We also have to fake this conversion stmt as SLP reduction
2253 group so we don't have to mess with too much code
2254 elsewhere. */
2255 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2256 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2257 }
2258
2259 vinfo->slp_instances.safe_push (new_instance);
2260
2261 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2262 the number of scalar stmts in the root in a few places.
2263 Verify that assumption holds. */
2264 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2265 .length () == group_size);
2266
2267 if (dump_enabled_p ())
2268 {
2269 dump_printf_loc (MSG_NOTE, vect_location,
2270 "Final SLP tree for instance:\n");
2271 vect_print_slp_graph (MSG_NOTE, vect_location,
2272 SLP_INSTANCE_TREE (new_instance));
2273 }
2274
2275 return true;
2276 }
2277 }
2278 else
2279 {
2280 /* Failed to SLP. */
2281 /* Free the allocated memory. */
2282 scalar_stmts.release ();
2283 }
2284
2285 /* Try to break the group up into pieces. */
2286 unsigned HOST_WIDE_INT const_nunits;
2287 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2288 && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))
2289 && nunits.is_constant (&const_nunits))
2290 {
2291 for (i = 0; i < group_size; i++)
2292 if (!matches[i])
2293 break;
2294
2295 /* For basic block SLP, try to break the group up into multiples of the
2296 vector size. */
2297 if (is_a <bb_vec_info> (vinfo)
2298 && (i >= const_nunits && i < group_size))
2299 {
2300 /* Split into two groups at the first vector boundary before i. */
2301 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2302 unsigned group1_size = i & ~(const_nunits - 1);
2303
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "Splitting SLP group at stmt %u\n", i);
2307 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2308 group1_size);
2309 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2310 max_tree_size);
2311 /* If the first non-match was in the middle of a vector,
2312 skip the rest of that vector. Do not bother to re-analyze
2313 single stmt groups. */
2314 if (group1_size < i)
2315 {
2316 i = group1_size + const_nunits;
2317 if (i + 1 < group_size)
2318 rest = vect_split_slp_store_group (rest, const_nunits);
2319 }
2320 if (i + 1 < group_size)
2321 res |= vect_analyze_slp_instance (vinfo, bst_map,
2322 rest, max_tree_size);
2323 return res;
2324 }
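/* A hedged example of the splitting above: with CONST_NUNITS 4, a
   group of 12 stores and the first mismatch at I == 5, GROUP1_SIZE is
   masked down to the vector boundary 4 and the first four stmts are
   re-analyzed on their own; because the mismatch fell in the middle
   of the second vector, that vector (stmts 4-7) is split off and
   dropped, and analysis recurses on the remainder from stmt 8 on.  */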
2325
2326 /* For loop vectorization, split into arbitrary pieces of size > 1. */
2327 if (is_a <loop_vec_info> (vinfo)
2328 && (i > 1 && i < group_size))
2329 {
2330 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2331 unsigned group1_size = i;
2332
2333 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_NOTE, vect_location,
2335 "Splitting SLP group at stmt %u\n", i);
2336
2337 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2338 group1_size);
2339 /* Loop vectorization cannot handle gaps in stores; make sure
2340 the split group appears as strided. */
2341 STMT_VINFO_STRIDED_P (rest) = 1;
2342 DR_GROUP_GAP (rest) = 0;
2343 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2344 DR_GROUP_GAP (stmt_info) = 0;
2345
2346 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2347 max_tree_size);
2348 if (i + 1 < group_size)
2349 res |= vect_analyze_slp_instance (vinfo, bst_map,
2350 rest, max_tree_size);
2351
2352 return res;
2353 }
2354
2355 /* Even though the first vector did not all match, we might still be able
2356 to SLP (some of) the remainder. FORNOW: ignore this possibility. */
2357 }
2358
2359 /* Failed to SLP. */
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2362 return false;
2363 }
2364
2365 /* Fill in backedge SLP children in the SLP graph. */
2366
2367 static void
2368 vect_analyze_slp_backedges (vec_info *vinfo, slp_tree node,
2369 scalar_stmts_to_slp_tree_map_t *bst_map,
2370 hash_set<slp_tree> &visited)
2371 {
2372 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
2373 || visited.add (node))
2374 return;
2375
2376 slp_tree child;
2377 unsigned i;
2378 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2379 if (child)
2380 vect_analyze_slp_backedges (vinfo, child, bst_map, visited);
2381
2382 if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
2383 for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i)
2384 {
2385 auto_vec<stmt_vec_info, 64> stmts;
2386 unsigned j;
2387 stmt_vec_info phi_info;
2388 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, phi_info)
2389 {
2390 tree def = gimple_phi_arg_def (as_a <gphi *>(phi_info->stmt), i);
2391 stmt_vec_info def_info = vinfo->lookup_def (def);
2392 if (!def_info)
2393 break;
2394 stmts.safe_push (vect_stmt_to_vectorize (def_info));
2395 }
2396 if (j != SLP_TREE_LANES (node))
2397 continue;
2398 slp_tree *edge_def = bst_map->get (stmts);
2399 if (edge_def)
2400 {
2401 /* ??? We're currently not recording non-backedge children
2402 of PHIs like external reduction initial values. Avoid
2403 NULL entries in SLP_TREE_CHILDREN for those and thus
2404 for now simply only record backedge defs at a
2405 SLP_TREE_CHILDREN index possibly not matching that of
2406 the corresponding PHI argument index. */
2407 SLP_TREE_CHILDREN (node).quick_push (*edge_def);
2408 SLP_TREE_REF_COUNT (*edge_def)++;
2409 }
2410 }
2411 }
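/* A hedged illustration: for a reduction PHI like
   x_1 = PHI <x_init (preheader), x_3 (latch)> the latch argument x_3
   is looked up in BST_MAP via the lane-wise defs collected above and,
   when an SLP node already exists for those defs, that node is pushed
   as an extra child of the PHI node, creating the backedge cycle in
   the SLP graph.  */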
2412
2413 /* Check if there are stmts in the loop that can be vectorized using SLP.
2414 Build SLP trees of packed scalar stmts if SLP is possible. */
2415
2416 opt_result
2417 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2418 {
2419 unsigned int i;
2420 stmt_vec_info first_element;
2421
2422 DUMP_VECT_SCOPE ("vect_analyze_slp");
2423
2424 scalar_stmts_to_slp_tree_map_t *bst_map
2425 = new scalar_stmts_to_slp_tree_map_t ();
2426
2427 /* Find SLP sequences starting from groups of grouped stores. */
2428 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2429 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2430
2431 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2432 {
2433 if (loop_vinfo->reduction_chains.length () > 0)
2434 {
2435 /* Find SLP sequences starting from reduction chains. */
2436 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2437 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2438 max_tree_size))
2439 {
2440 /* Dissolve reduction chain group. */
2441 stmt_vec_info stmt_vinfo = first_element;
2442 stmt_vec_info last = NULL;
2443 while (stmt_vinfo)
2444 {
2445 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (stmt_vinfo);
2446 REDUC_GROUP_FIRST_ELEMENT (stmt_vinfo) = NULL;
2447 REDUC_GROUP_NEXT_ELEMENT (stmt_vinfo) = NULL;
2448 last = stmt_vinfo;
2449 stmt_vinfo = next;
2450 }
2451 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2452 /* It can still be vectorized as part of an SLP reduction. */
2453 loop_vinfo->reductions.safe_push (last);
2454 }
2455 }
2456
2457 /* Find SLP sequences starting from groups of reductions. */
2458 if (loop_vinfo->reductions.length () > 1)
2459 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2460 max_tree_size);
2461 }
2462
2463 /* Fill in backedges. */
2464 slp_instance instance;
2465 hash_set<slp_tree> visited;
2466 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2467 vect_analyze_slp_backedges (vinfo, SLP_INSTANCE_TREE (instance),
2468 bst_map, visited);
2469
2470 /* The map keeps a reference on SLP nodes built, release that. */
2471 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2472 it != bst_map->end (); ++it)
2473 if ((*it).second)
2474 vect_free_slp_tree ((*it).second);
2475 delete bst_map;
2476
2477 return opt_result::success ();
2478 }
2479
2480 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graph. */
2481
2482 static void
2483 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2484 vec<slp_tree> &vertices, vec<int> &leafs)
2485 {
2486 unsigned i;
2487 slp_tree child;
2488
2489 if (visited.add (node))
2490 return;
2491
2492 node->vertex = vertices.length ();
2493 vertices.safe_push (node);
2494 if (SLP_TREE_CHILDREN (node).is_empty ())
2495 leafs.safe_push (node->vertex);
2496 else
2497 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2498 vect_slp_build_vertices (visited, child, vertices, leafs);
2499 }
2500
2501 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graphs of INFO's SLP instances. */
2502
2503 static void
2504 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2505 vec<int> &leafs)
2506 {
2507 hash_set<slp_tree> visited;
2508 unsigned i;
2509 slp_instance instance;
2510 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2511 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2512 leafs);
2513 }
2514
2515 /* Apply the bijective permutation PERM to VEC, or its inverse if REVERSE. */
2516
2517 template <class T>
2518 static void
2519 vect_slp_permute (vec<unsigned> perm,
2520 vec<T> &vec, bool reverse)
2521 {
2522 auto_vec<T, 64> saved;
2523 saved.create (vec.length ());
2524 for (unsigned i = 0; i < vec.length (); ++i)
2525 saved.quick_push (vec[i]);
2526
2527 if (reverse)
2528 {
2529 for (unsigned i = 0; i < vec.length (); ++i)
2530 vec[perm[i]] = saved[i];
2531 for (unsigned i = 0; i < vec.length (); ++i)
2532 gcc_assert (vec[perm[i]] == saved[i]);
2533 }
2534 else
2535 {
2536 for (unsigned i = 0; i < vec.length (); ++i)
2537 vec[i] = saved[perm[i]];
2538 for (unsigned i = 0; i < vec.length (); ++i)
2539 gcc_assert (vec[i] == saved[perm[i]]);
2540 }
2541 }
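/* A hedged example: with PERM { 2, 0, 1 } and VEC { a, b, c } the
   forward direction computes vec[i] = saved[perm[i]], giving
   { c, a, b }, while REVERSE computes vec[perm[i]] = saved[i],
   giving { b, c, a }; the two directions are inverses of each
   other.  */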
2542
2543 /* Return whether permutations PERM_A and PERM_B as recorded in the
2544 PERMS vector are equal. */
2545
2546 static bool
2547 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2548 int perm_a, int perm_b)
2549 {
2550 return (perm_a == perm_b
2551 || (perms[perm_a].length () == perms[perm_b].length ()
2552 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2553 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2554 }
2555
2556 /* Optimize the SLP graph of VINFO. */
2557
2558 void
2559 vect_optimize_slp (vec_info *vinfo)
2560 {
2561 if (vinfo->slp_instances.is_empty ())
2562 return;
2563
2564 slp_tree node;
2565 unsigned i;
2566 auto_vec<slp_tree> vertices;
2567 auto_vec<int> leafs;
2568 vect_slp_build_vertices (vinfo, vertices, leafs);
2569
2570 struct graph *slpg = new_graph (vertices.length ());
2571 FOR_EACH_VEC_ELT (vertices, i, node)
2572 {
2573 unsigned j;
2574 slp_tree child;
2575 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2576 add_edge (slpg, i, child->vertex);
2577 }
2578
2579 /* Compute (reverse) postorder on the inverted graph. */
2580 auto_vec<int> ipo;
2581 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2582
2583 auto_sbitmap n_visited (vertices.length ());
2584 auto_sbitmap n_materialize (vertices.length ());
2585 auto_vec<int> n_perm (vertices.length ());
2586 auto_vec<vec<unsigned> > perms;
2587
2588 bitmap_clear (n_visited);
2589 bitmap_clear (n_materialize);
2590 n_perm.quick_grow_cleared (vertices.length ());
2591 perms.safe_push (vNULL); /* zero is no permute */
2592
2593 /* Produce initial permutations. */
2594 for (i = 0; i < leafs.length (); ++i)
2595 {
2596 int idx = leafs[i];
2597 slp_tree node = vertices[idx];
2598
2599 /* Handle externals and constants optimistically throughout the
2600 iteration. */
2601 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2602 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2603 continue;
2604
2605 /* Loads are the only thing generating permutes and leafs do not
2606 change across iterations. */
2607 bitmap_set_bit (n_visited, idx);
2608 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2609 continue;
2610
2611 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2612 node unpermuted, record this permute. */
2613 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2614 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2615 continue;
2616 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2617 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2618 bool any_permute = false;
2619 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2620 {
2621 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2622 imin = MIN (imin, idx);
2623 imax = MAX (imax, idx);
2624 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2625 any_permute = true;
2626 }
2627 /* If there's no permute, there's no need to split one out. */
2628 if (!any_permute)
2629 continue;
2630 /* If the span doesn't match we'd disrupt VF computation; avoid
2631 that for now. */
2632 if (imax - imin + 1 != SLP_TREE_LANES (node))
2633 continue;
2634
2635 /* For now only handle true permutes, like
2636 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2637 when permuting constants and invariants, keeping the permute
2638 bijective. */
2639 auto_sbitmap load_index (SLP_TREE_LANES (node));
2640 bitmap_clear (load_index);
2641 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2642 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2643 unsigned j;
2644 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2645 if (!bitmap_bit_p (load_index, j))
2646 break;
2647 if (j != SLP_TREE_LANES (node))
2648 continue;
2649
2650 vec<unsigned> perm = vNULL;
2651 perm.safe_grow (SLP_TREE_LANES (node), true);
2652 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2653 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2654 perms.safe_push (perm);
2655 n_perm[idx] = perms.length () - 1;
2656 }
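/* A hedged example of the seeding above: a load node with
   SLP_TREE_LOAD_PERMUTATION { 5, 4, 7, 6 } has imin 4 and a span of
   four lanes, so the bijective permute { 1, 0, 3, 2 } is recorded for
   it; { 4, 5, 5, 7 } covers the same span but fails the load_index
   bijectivity check and seeds nothing.  */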
2657
2658 /* Propagate permutes along the graph and compute materialization points. */
2659 bool changed;
2660 unsigned iteration = 0;
2661 do
2662 {
2663 changed = false;
2664 ++iteration;
2665
2666 for (i = vertices.length (); i > 0 ; --i)
2667 {
2668 int idx = ipo[i-1];
2669 slp_tree node = vertices[idx];
2670 /* For leafs there's nothing to do - we've seeded permutes
2671 on those above. */
2672 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2673 continue;
2674
2675 bitmap_set_bit (n_visited, idx);
2676
2677 /* We cannot move a permute across a store. */
2678 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2679 && DR_IS_WRITE
2680 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2681 continue;
2682
2683 int perm = -1;
2684 for (graph_edge *succ = slpg->vertices[idx].succ;
2685 succ; succ = succ->succ_next)
2686 {
2687 int succ_idx = succ->dest;
2688 /* Handle unvisited nodes optimistically. */
2689 /* ??? But for constants once we want to handle non-bijective
2690 permutes we have to verify the permute, when unifying lanes,
2691 will not unify different constants. For example see
2692 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2693 if (!bitmap_bit_p (n_visited, succ_idx))
2694 continue;
2695 int succ_perm = n_perm[succ_idx];
2696 /* Once we materialize a successor's permutation, its output lanes
2697 appear unpermuted to us. */
2698 if (bitmap_bit_p (n_materialize, succ_idx))
2699 succ_perm = 0;
2700 if (perm == -1)
2701 perm = succ_perm;
2702 else if (succ_perm == 0)
2703 {
2704 perm = 0;
2705 break;
2706 }
2707 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2708 {
2709 perm = 0;
2710 break;
2711 }
2712 }
2713
2714 if (perm == -1)
2715 /* Pick up pre-computed leaf values. */
2716 perm = n_perm[idx];
2717 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2718 {
2719 if (iteration > 1)
2720 /* Make sure we eventually converge. */
2721 gcc_checking_assert (perm == 0);
2722 n_perm[idx] = perm;
2723 if (perm == 0)
2724 bitmap_clear_bit (n_materialize, idx);
2725 changed = true;
2726 }
2727
2728 if (perm == 0)
2729 continue;
2730
2731 /* Elide pruning at materialization points in the first
2732 iteration so that every node is visited at least once. */
2733 if (iteration == 1)
2734 continue;
2735
2736 /* Decide on permute materialization. Look whether there's
2737 a use (pred) edge that is permuted differently than us.
2738 In that case mark ourselves so the permutation is applied. */
2739 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2740 for (graph_edge *pred = slpg->vertices[idx].pred;
2741 pred; pred = pred->pred_next)
2742 {
2743 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2744 int pred_perm = n_perm[pred->src];
2745 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2746 {
2747 all_preds_permuted = false;
2748 break;
2749 }
2750 }
2751 if (!all_preds_permuted)
2752 {
2753 if (!bitmap_bit_p (n_materialize, idx))
2754 changed = true;
2755 bitmap_set_bit (n_materialize, idx);
2756 }
2757 }
2758 }
2759 while (changed || iteration == 1);
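/* To sketch the propagation (hedged): a node whose successors all
   carry the same permute inherits it, while a single unpermuted (or
   already materialized, hence unpermuted-looking) successor makes the
   meet fall back to zero.  A node that keeps a permute but has a
   differently-permuted use (pred) edge is recorded in n_materialize
   so the permute is applied at that point.  */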
2760
2761 /* Materialize. */
2762 for (i = 0; i < vertices.length (); ++i)
2763 {
2764 int perm = n_perm[i];
2765 if (perm <= 0)
2766 continue;
2767
2768 slp_tree node = vertices[i];
2769
2770 /* First permute invariant/external original successors. */
2771 unsigned j;
2772 slp_tree child;
2773 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2774 {
2775 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2776 continue;
2777
2778 /* If the vector is uniform there's nothing to do. */
2779 if (vect_slp_tree_uniform_p (child))
2780 continue;
2781
2782 /* We can end up sharing some externals via two_operator
2783 handling. Be prepared to unshare those. */
2784 if (SLP_TREE_REF_COUNT (child) != 1)
2785 {
2786 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2787 SLP_TREE_CHILDREN (node)[j] = child
2788 = vect_create_new_slp_node
2789 (SLP_TREE_SCALAR_OPS (child).copy ());
2790 }
2791 vect_slp_permute (perms[perm],
2792 SLP_TREE_SCALAR_OPS (child), true);
2793 }
2794
2795 if (bitmap_bit_p (n_materialize, i))
2796 {
2797 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2798 /* For loads, simply drop the permutation; the load permutation
2799 already performs the desired permutation. */
2800 ;
2801 else
2802 {
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_NOTE, vect_location,
2805 "inserting permute node in place of %p\n",
2806 node);
2807
2808 /* Make a copy of NODE and in-place change it to a
2809 VEC_PERM node to permute the lanes of the copy. */
2810 slp_tree copy = new _slp_tree;
2811 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2812 SLP_TREE_CHILDREN (node) = vNULL;
2813 SLP_TREE_SCALAR_STMTS (copy)
2814 = SLP_TREE_SCALAR_STMTS (node).copy ();
2815 vect_slp_permute (perms[perm],
2816 SLP_TREE_SCALAR_STMTS (copy), true);
2817 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2818 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2819 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2820 SLP_TREE_LANE_PERMUTATION (copy)
2821 = SLP_TREE_LANE_PERMUTATION (node);
2822 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
2823 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
2824 SLP_TREE_REF_COUNT (copy) = 1;
2825 copy->max_nunits = node->max_nunits;
2826 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
2827 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
2828 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
2829
2830 /* Now turn NODE into a VEC_PERM. */
2831 SLP_TREE_CHILDREN (node).safe_push (copy);
2832 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
2833 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2834 SLP_TREE_LANE_PERMUTATION (node)
2835 .quick_push (std::make_pair (0, perms[perm][j]));
2836 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2837 }
2838 }
2839 else
2840 {
2841 /* Apply the reverse permutation to our stmts. */
2842 vect_slp_permute (perms[perm],
2843 SLP_TREE_SCALAR_STMTS (node), true);
2844 /* And to the load permutation, which we can simply
2845 make regular by design. */
2846 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2847 {
2848 /* ??? When we handle non-bijective permutes the idea
2849 is that we can force the load-permutation to be
2850 { min, min + 1, min + 2, ... max }. But then the
2851 scalar defs might no longer match the lane content
2852 which means wrong-code with live lane vectorization.
2853 So we possibly have to have NULL entries for those. */
2854 vect_slp_permute (perms[perm],
2855 SLP_TREE_LOAD_PERMUTATION (node), true);
2856 }
2857 }
2858 }
2859
2860 /* Free the perms vector used for propagation. */
2861 while (!perms.is_empty ())
2862 perms.pop ().release ();
2863 free_graph (slpg);
2864
2865
2866 /* Now elide load permutations that are not necessary. */
2867 for (i = 0; i < leafs.length (); ++i)
2868 {
2869 node = vertices[i];
2870 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2871 continue;
2872
2873 /* In basic block vectorization we allow any subchain of an interleaving
2874 chain.
2875 FORNOW: not in loop SLP because of realignment complications. */
2876 if (is_a <bb_vec_info> (vinfo))
2877 {
2878 bool subchain_p = true;
2879 stmt_vec_info next_load_info = NULL;
2880 stmt_vec_info load_info;
2881 unsigned j;
2882 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2883 {
2884 if (j != 0
2885 && (next_load_info != load_info
2886 || DR_GROUP_GAP (load_info) != 1))
2887 {
2888 subchain_p = false;
2889 break;
2890 }
2891 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2892 }
2893 if (subchain_p)
2894 {
2895 SLP_TREE_LOAD_PERMUTATION (node).release ();
2896 continue;
2897 }
2898 }
2899 else
2900 {
2901 stmt_vec_info load_info;
2902 bool this_load_permuted = false;
2903 unsigned j;
2904 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2905 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2906 {
2907 this_load_permuted = true;
2908 break;
2909 }
2910 stmt_vec_info first_stmt_info
2911 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2912 if (!this_load_permuted
2913 /* The load requires permutation when unrolling exposes
2914 a gap either because the group is larger than the SLP
2915 group-size or because there is a gap between the groups. */
2916 && (known_eq (LOOP_VINFO_VECT_FACTOR
2917 (as_a <loop_vec_info> (vinfo)), 1U)
2918 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2919 && DR_GROUP_GAP (first_stmt_info) == 0)))
2920 {
2921 SLP_TREE_LOAD_PERMUTATION (node).release ();
2922 continue;
2923 }
2924 }
2925 }
2926 }
2927
2928
2929 /* For each possible SLP instance decide whether to SLP it and calculate the
2930 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2931 to SLP at least one instance. */
2932
2933 bool
2934 vect_make_slp_decision (loop_vec_info loop_vinfo)
2935 {
2936 unsigned int i;
2937 poly_uint64 unrolling_factor = 1;
2938 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2939 slp_instance instance;
2940 int decided_to_slp = 0;
2941
2942 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2943
2944 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2945 {
2946 /* FORNOW: SLP if you can. */
2947 /* All unroll factors have the form:
2948
2949 GET_MODE_SIZE (vinfo->vector_mode) * X
2950
2951 for some rational X, so they must have a common multiple. */
2952 unrolling_factor
2953 = force_common_multiple (unrolling_factor,
2954 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2955
2956 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2957 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2958 loop-based vectorization. Such stmts will be marked as HYBRID. */
2959 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2960 decided_to_slp++;
2961 }
2962
2963 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2964
2965 if (decided_to_slp && dump_enabled_p ())
2966 {
2967 dump_printf_loc (MSG_NOTE, vect_location,
2968 "Decided to SLP %d instances. Unrolling factor ",
2969 decided_to_slp);
2970 dump_dec (MSG_NOTE, unrolling_factor);
2971 dump_printf (MSG_NOTE, "\n");
2972 }
2973
2974 return (decided_to_slp > 0);
2975 }
2976
2977 /* Private data for vect_detect_hybrid_slp. */
2978 struct vdhs_data
2979 {
2980 loop_vec_info loop_vinfo;
2981 vec<stmt_vec_info> *worklist;
2982 };
2983
2984 /* Walker for walk_gimple_op. */
2985
2986 static tree
2987 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2988 {
2989 walk_stmt_info *wi = (walk_stmt_info *)data;
2990 vdhs_data *dat = (vdhs_data *)wi->info;
2991
2992 if (wi->is_lhs)
2993 return NULL_TREE;
2994
2995 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2996 if (!def_stmt_info)
2997 return NULL_TREE;
2998 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2999 if (PURE_SLP_STMT (def_stmt_info))
3000 {
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3003 def_stmt_info->stmt);
3004 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3005 dat->worklist->safe_push (def_stmt_info);
3006 }
3007
3008 return NULL_TREE;
3009 }
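/* A hedged example: if t = a[i] * b[i] is marked pure_slp but a
   non-SLP stmt such as sum += t also uses t, the walker above
   re-marks the multiplication as hybrid and queues it, so it is
   additionally handled by loop-based vectorization for that use.  */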
3010
3011 /* Find stmts that must be both vectorized and SLPed. */
3012
3013 void
3014 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3015 {
3016 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3017
3018 /* All stmts participating in SLP are marked pure_slp, all other
3019 stmts are loop_vect.
3020 First collect all loop_vect stmts into a worklist. */
3021 auto_vec<stmt_vec_info> worklist;
3022 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3023 {
3024 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3025 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3026 gsi_next (&gsi))
3027 {
3028 gphi *phi = gsi.phi ();
3029 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3030 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3031 worklist.safe_push (stmt_info);
3032 }
3033 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3034 gsi_next (&gsi))
3035 {
3036 gimple *stmt = gsi_stmt (gsi);
3037 if (is_gimple_debug (stmt))
3038 continue;
3039 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3040 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3041 {
3042 for (gimple_stmt_iterator gsi2
3043 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3044 !gsi_end_p (gsi2); gsi_next (&gsi2))
3045 {
3046 stmt_vec_info patt_info
3047 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3048 if (!STMT_SLP_TYPE (patt_info)
3049 && STMT_VINFO_RELEVANT (patt_info))
3050 worklist.safe_push (patt_info);
3051 }
3052 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3053 }
3054 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3055 worklist.safe_push (stmt_info);
3056 }
3057 }
3058
3059 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3060 mark any SLP vectorized stmt as hybrid.
3061 ??? We're visiting def stmts N times (once for each non-SLP and
3062 once for each hybrid-SLP use). */
3063 walk_stmt_info wi;
3064 vdhs_data dat;
3065 dat.worklist = &worklist;
3066 dat.loop_vinfo = loop_vinfo;
3067 memset (&wi, 0, sizeof (wi));
3068 wi.info = (void *)&dat;
3069 while (!worklist.is_empty ())
3070 {
3071 stmt_vec_info stmt_info = worklist.pop ();
3072 /* Since SSA operands are not set up for pattern stmts we need
3073 to use walk_gimple_op. */
3074 wi.is_lhs = 0;
3075 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3076 }
3077 }
3078
3079
3080 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3081
3082 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3083 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3084 {
3085 for (unsigned i = 0; i < bbs.length (); ++i)
3086 {
3087 if (i != 0)
3088 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3089 gsi_next (&si))
3090 {
3091 gphi *phi = si.phi ();
3092 gimple_set_uid (phi, 0);
3093 add_stmt (phi);
3094 }
3095 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3096 !gsi_end_p (gsi); gsi_next (&gsi))
3097 {
3098 gimple *stmt = gsi_stmt (gsi);
3099 gimple_set_uid (stmt, 0);
3100 if (is_gimple_debug (stmt))
3101 continue;
3102 add_stmt (stmt);
3103 }
3104 }
3105 }
3106
3107
3108 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3109 stmts in the basic blocks. */
3110
3111 _bb_vec_info::~_bb_vec_info ()
3112 {
3113 /* Reset region marker. */
3114 for (unsigned i = 0; i < bbs.length (); ++i)
3115 {
3116 if (i != 0)
3117 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3118 gsi_next (&si))
3119 {
3120 gphi *phi = si.phi ();
3121 gimple_set_uid (phi, -1);
3122 }
3123 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3124 !gsi_end_p (gsi); gsi_next (&gsi))
3125 {
3126 gimple *stmt = gsi_stmt (gsi);
3127 gimple_set_uid (stmt, -1);
3128 }
3129 }
3130 }
3131
3132 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3133 given that child nodes have already been processed, and that
3134 their def types currently match their SLP node's def type. */
3135
3136 static bool
3137 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3138 slp_instance node_instance,
3139 stmt_vector_for_cost *cost_vec)
3140 {
3141 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3142 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3143
3144 /* Calculate the number of vector statements to be created for the
3145 scalar stmts in this node. For SLP reductions it is equal to the
3146 number of vector statements in the children (which has already been
3147 calculated by the recursive call). Otherwise it is the number of
3148 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3149 VF divided by the number of elements in a vector. */
3150 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3151 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3152 {
3153 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3154 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3155 {
3156 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3157 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3158 break;
3159 }
3160 }
3161 else
3162 {
3163 poly_uint64 vf;
3164 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3165 vf = loop_vinfo->vectorization_factor;
3166 else
3167 vf = 1;
3168 unsigned int group_size = SLP_TREE_LANES (node);
3169 tree vectype = SLP_TREE_VECTYPE (node);
3170 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3171 = vect_get_num_vectors (vf * group_size, vectype);
3172 }
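/* E.g. (a hedged example): with a vectorization factor of 4, two
   lanes and a four-element vector type this computes
   vect_get_num_vectors (4 * 2, vectype) == 2 vector stmts.  */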
3173
3174 /* Handle purely internal nodes. */
3175 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3176 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3177
3178 if (is_a <bb_vec_info> (vinfo)
3179 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3180 return false;
3181
3182 bool dummy;
3183 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3184 node, node_instance, cost_vec);
3185 }
3186
3187 /* Try to build NODE from scalars, returning true on success.
3188 NODE_INSTANCE is the SLP instance that contains NODE. */
3189
3190 static bool
3191 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3192 slp_instance node_instance)
3193 {
3194 stmt_vec_info stmt_info;
3195 unsigned int i;
3196
3197 if (!is_a <bb_vec_info> (vinfo)
3198 || node == SLP_INSTANCE_TREE (node_instance)
3199 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3200 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3201 return false;
3202
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location,
3205 "Building vector operands from scalars instead\n");
3206
3207 /* Don't remove and free the child nodes here, since they could be
3208 referenced by other structures. The analysis and scheduling phases
3209 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3210 unsigned int group_size = SLP_TREE_LANES (node);
3211 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3212 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3213 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3214 {
3215 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3216 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3217 }
3218 return true;
3219 }
3220
3221 /* Compute the prologue cost for invariant or constant operands represented
3222 by NODE. */
3223
3224 static void
3225 vect_prologue_cost_for_slp (slp_tree node,
3226 stmt_vector_for_cost *cost_vec)
3227 {
3228 /* There's a special case of an existing vector, which costs nothing. */
3229 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3230 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3231 return;
3232 /* Without looking at the actual initializer a vector of
3233 constants can be implemented as load from the constant pool.
3234 When all elements are the same we can use a splat. */
3235 tree vectype = SLP_TREE_VECTYPE (node);
3236 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3237 unsigned num_vects_to_check;
3238 unsigned HOST_WIDE_INT const_nunits;
3239 unsigned nelt_limit;
3240 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3241 && ! multiple_p (const_nunits, group_size))
3242 {
3243 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3244 nelt_limit = const_nunits;
3245 }
3246 else
3247 {
3248 /* If either the vector has variable length or the vectors
3249 are composed of repeated whole groups we only need to
3250 cost construction once. All vectors will be the same. */
3251 num_vects_to_check = 1;
3252 nelt_limit = group_size;
3253 }
3254 tree elt = NULL_TREE;
3255 unsigned nelt = 0;
3256 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3257 {
3258 unsigned si = j % group_size;
3259 if (nelt == 0)
3260 elt = SLP_TREE_SCALAR_OPS (node)[si];
3261 /* ??? We're just tracking whether all operands of a single
3262 vector initializer are the same; ideally we'd check if
3263 we emitted the same one already. */
3264 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3265 elt = NULL_TREE;
3266 nelt++;
3267 if (nelt == nelt_limit)
3268 {
3269 record_stmt_cost (cost_vec, 1,
3270 SLP_TREE_DEF_TYPE (node) == vect_external_def
3271 ? (elt ? scalar_to_vec : vec_construct)
3272 : vector_load,
3273 NULL, vectype, 0, vect_prologue);
3274 nelt = 0;
3275 }
3276 }
3277 }
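/* A hedged illustration of the costing above: for an external node
   with SCALAR_OPS { a, a, b, b } and a two-element vector type,
   const_nunits (2) is not a multiple of the group size (4), so both
   vectors are inspected individually; { a, a } and { b, b } are
   splats costing one scalar_to_vec each, while a mixed vector like
   { a, b } would cost a vec_construct instead.  */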
3278
3279 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3280 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3281
3282 Return true if the operations are supported. */
3283
3284 static bool
3285 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3286 slp_instance node_instance,
3287 hash_set<slp_tree> &visited,
3288 hash_set<slp_tree> &lvisited,
3289 stmt_vector_for_cost *cost_vec)
3290 {
3291 int i, j;
3292 slp_tree child;
3293
3294 /* Assume we can code-generate all invariants. */
3295 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3296 return true;
3297
3298 /* If we already analyzed the exact same set of scalar stmts we're done.
3299 We share the generated vector stmts for those. */
3300 if (visited.contains (node)
3301 || lvisited.add (node))
3302 return true;
3303
3304 bool res = true;
3305 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3306 {
3307 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3308 visited, lvisited, cost_vec);
3309 if (!res)
3310 break;
3311 }
3312
3313 if (res)
3314 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3315 cost_vec);
3316 if (!res)
3317 lvisited.remove (node);
3318
3319 /* When the node can be vectorized, cost the invariant nodes it references.
3320 This is not done in DFS order to allow the referring node's
3321 vectorizable_* calls to nail down the invariant node's vector type
3322 and possibly unshare it if it needs a different vector type than
3323 other referrers. */
3324 if (res)
3325 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3326 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
3327 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3328 /* Perform usual caching, note code-generation still
3329 code-gens these nodes multiple times but we expect
3330 to CSE them later. */
3331 && !visited.contains (child)
3332 && !lvisited.add (child))
3333 {
3334 /* ??? After auditing more code paths make a "default"
3335 and push the vector type from NODE to all children
3336 if it is not already set. */
3337 /* Compute the number of vectors to be generated. */
3338 tree vector_type = SLP_TREE_VECTYPE (child);
3339 if (!vector_type)
3340 {
3341 /* For shifts with a scalar argument we don't need
3342 to cost or code-generate anything.
3343 ??? Represent this more explicitly. */
3344 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3345 == shift_vec_info_type)
3346 && j == 1);
3347 continue;
3348 }
3349 unsigned group_size = SLP_TREE_LANES (child);
3350 poly_uint64 vf = 1;
3351 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3352 vf = loop_vinfo->vectorization_factor;
3353 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3354 = vect_get_num_vectors (vf * group_size, vector_type);
3355 /* And cost them. */
3356 vect_prologue_cost_for_slp (child, cost_vec);
3357 }
3358
3359 /* If this node or any of its children can't be vectorized, try pruning
3360 the tree here rather than felling the whole thing. */
3361 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3362 {
3363 /* We'll need to revisit this for invariant costing and number
3364 of vectorized stmt setting. */
3365 res = true;
3366 }
3367
3368 return res;
3369 }
3370
3371
3372 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3373 region and that can be vectorized using vectorizable_live_operation
3374 with STMT_VINFO_LIVE_P. Live operations that are not handled will
3375 cause the scalar code computing them to be retained. */
3376
3377 static void
3378 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3379 slp_instance instance,
3380 stmt_vector_for_cost *cost_vec,
3381 hash_set<stmt_vec_info> &svisited)
3382 {
3383 unsigned i;
3384 stmt_vec_info stmt_info;
3385 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3386 bool all_visited = true;
3387 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3388 {
3389 if (svisited.contains (stmt_info))
3390 continue;
3391 all_visited = false;
3392 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3393 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3394 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3395 /* Only the pattern root stmt computes the original scalar value. */
3396 continue;
3397 bool mark_visited = true;
3398 gimple *orig_stmt = orig_stmt_info->stmt;
3399 ssa_op_iter op_iter;
3400 def_operand_p def_p;
3401 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3402 {
3403 imm_use_iterator use_iter;
3404 gimple *use_stmt;
3405 stmt_vec_info use_stmt_info;
3406 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3407 if (!is_gimple_debug (use_stmt))
3408 {
3409 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3410 if (!use_stmt_info
3411 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3412 {
3413 STMT_VINFO_LIVE_P (stmt_info) = true;
3414 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3415 NULL, node, instance, i,
3416 false, cost_vec))
3417 /* ??? So we know we can vectorize the live stmt
3418 from one SLP node. If we cannot do so from all
3419 or none consistently we'd have to record which
3420 SLP node (and lane) we want to use for the live
3421 operation. So make sure we can code-generate
3422 from all nodes. */
3423 mark_visited = false;
3424 else
3425 STMT_VINFO_LIVE_P (stmt_info) = false;
3426 BREAK_FROM_IMM_USE_STMT (use_iter);
3427 }
3428 }
3429 /* We have to verify whether we can insert the lane extract
3430 before all uses. The following is a conservative approximation.
3431 We cannot put this into vectorizable_live_operation because
3432 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3433 doesn't work.
3434 Note that while the fact that we emit code for loads at the
3435 first load should make this a non-problem, leafs we construct
3436 from scalars are vectorized after the last scalar def.
3437 ??? If we'd actually compute the insert location during
3438 analysis we could use sth less conservative than the last
3439 scalar stmt in the node for the dominance check. */
3440 /* ??? What remains is "live" uses in vector CTORs in the same
3441 SLP graph which is where those uses can end up code-generated
3442 right after their definition instead of close to their original
3443 use. But that would restrict us to code-generate lane-extracts
3444 from the latest stmt in a node. So we compensate for this
3445 during code-generation, simply not replacing uses for those
3446 hopefully rare cases. */
3447 if (STMT_VINFO_LIVE_P (stmt_info))
3448 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3449 if (!is_gimple_debug (use_stmt)
3450 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3451 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3452 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3453 {
3454 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3456 "Cannot determine insertion place for "
3457 "lane extract\n");
3458 STMT_VINFO_LIVE_P (stmt_info) = false;
3459 mark_visited = true;
3460 }
3461 }
3462 if (mark_visited)
3463 svisited.add (stmt_info);
3464 }
3465 if (all_visited)
3466 return;
3467
3468 slp_tree child;
3469 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3470 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3471 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3472 cost_vec, svisited);
3473 }
3474
3475 /* Analyze statements in SLP instances of VINFO. Return true if the
3476 operations are supported. */
3477
3478 bool
3479 vect_slp_analyze_operations (vec_info *vinfo)
3480 {
3481 slp_instance instance;
3482 int i;
3483
3484 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3485
3486 hash_set<slp_tree> visited;
3487 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3488 {
3489 hash_set<slp_tree> lvisited;
3490 stmt_vector_for_cost cost_vec;
3491 cost_vec.create (2);
3492 if (is_a <bb_vec_info> (vinfo))
3493 vect_location = instance->location ();
3494 if (!vect_slp_analyze_node_operations (vinfo,
3495 SLP_INSTANCE_TREE (instance),
3496 instance, visited, lvisited,
3497 &cost_vec)
3498 /* Instances with a root stmt require vectorized defs for the
3499 SLP tree root. */
3500 || (SLP_INSTANCE_ROOT_STMT (instance)
3501 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3502 != vect_internal_def)))
3503 {
3504 slp_tree node = SLP_INSTANCE_TREE (instance);
3505 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3506 if (dump_enabled_p ())
3507 dump_printf_loc (MSG_NOTE, vect_location,
3508 "removing SLP instance operations starting from: %G",
3509 stmt_info->stmt);
3510 vect_free_slp_instance (instance);
3511 vinfo->slp_instances.ordered_remove (i);
3512 cost_vec.release ();
3513 }
3514 else
3515 {
3516 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3517 x != lvisited.end(); ++x)
3518 visited.add (*x);
3519 i++;
3520
3521 /* For BB vectorization remember the SLP graph entry
3522 cost for later. */
3523 if (is_a <bb_vec_info> (vinfo))
3524 instance->cost_vec = cost_vec;
3525 else
3526 {
3527 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3528 cost_vec.release ();
3529 }
3530 }
3531 }
3532
3533 /* Compute vectorizable live stmts. */
3534 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3535 {
3536 hash_set<stmt_vec_info> svisited;
3537 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3538 {
3539 vect_location = instance->location ();
3540 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3541 instance, &instance->cost_vec, svisited);
3542 }
3543 }
3544
3545 return !vinfo->slp_instances.is_empty ();
3546 }
3547
3548 /* Get the ultimate leader of INSTANCE from INSTANCE_LEADER, transitively
3549 closing (and path-compressing) any chain of leaders on the way. */
3550
3551 static slp_instance
3552 get_ultimate_leader (slp_instance instance,
3553 hash_map<slp_instance, slp_instance> &instance_leader)
3554 {
3555 auto_vec<slp_instance *, 8> chain;
3556 slp_instance *tem;
3557 while (*(tem = instance_leader.get (instance)) != instance)
3558 {
3559 chain.safe_push (tem);
3560 instance = *tem;
3561 }
3562 while (!chain.is_empty ())
3563 *chain.pop () = instance;
3564 return instance;
3565 }
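/* Illustrative sketch (not part of GCC): the same find-with-path-compression
   technique on a plain array-based leader map, with hypothetical names,
   assuming leader[x] == x marks a root.  */
#if 0
static int
leader_example (int *leader, int x)
{
  /* Follow the chain to the ultimate leader.  */
  int root = x;
  while (leader[root] != root)
    root = leader[root];
  /* Compress the chain so later lookups are O(1).  */
  while (leader[x] != root)
    {
      int next = leader[x];
      leader[x] = root;
      x = next;
    }
  return root;
}
#endif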
3566
3567 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3568
3569 static void
3570 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3571 slp_instance instance, slp_tree node,
3572 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3573 hash_map<slp_instance, slp_instance> &instance_leader,
3574 hash_set<slp_tree> &visited)
3575 {
3576 stmt_vec_info stmt_info;
3577 unsigned i;
3578
3579 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3580 {
3581 bool existed_p;
3582 slp_instance &stmt_instance
3583 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3584 if (!existed_p)
3585 ;
3586 else if (stmt_instance != instance)
3587 {
3588 /* If we're running into a previously marked stmt make us the
3589 leader of the current ultimate leader. This keeps the
3590 leader chain acyclic and works even when the current instance
3591 connects two previously independent graph parts. */
3592 slp_instance stmt_leader
3593 = get_ultimate_leader (stmt_instance, instance_leader);
3594 if (stmt_leader != instance)
3595 instance_leader.put (stmt_leader, instance);
3596 }
3597 stmt_instance = instance;
3598 }
3599
3600 if (visited.add (node))
3601 return;
3602
3603 slp_tree child;
3604 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3605 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3606 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3607 instance_leader, visited);
3608 }
3609
3610 /* Partition the SLP graph into pieces that can be costed independently. */
3611
3612 static void
3613 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3614 {
3615 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3616
3617 /* First walk the SLP graph assigning each involved scalar stmt a
3618 corresponding SLP graph entry and upon visiting a previously
3619 marked stmt, make the stmt's leader the current SLP graph entry. */
3620 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3621 hash_map<slp_instance, slp_instance> instance_leader;
3622 hash_set<slp_tree> visited;
3623 slp_instance instance;
3624 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3625 {
3626 instance_leader.put (instance, instance);
3627 vect_bb_partition_graph_r (bb_vinfo,
3628 instance, SLP_INSTANCE_TREE (instance),
3629 stmt_to_instance, instance_leader,
3630 visited);
3631 }
3632
3633 /* Then collect entries to each independent subgraph. */
3634 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3635 {
3636 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3637 leader->subgraph_entries.safe_push (instance);
3638 if (dump_enabled_p ()
3639 && leader != instance)
3640 dump_printf_loc (MSG_NOTE, vect_location,
3641 "instance %p is leader of %p\n",
3642 leader, instance);
3643 }
3644 }
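/* For example: if instances I1 and I2 share a scalar stmt, the walk over
   the second instance finds the stmt already mapped to I1 and makes I2
   the leader of I1's ultimate leader.  The collection loop then pushes
   both I1 and I2 onto I2->subgraph_entries, so the shared subgraph is
   costed as one unit.  */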
3645
3646 /* Compute the scalar cost of the SLP node NODE and its children
3647 and record it in COST_VEC. Do not account defs that are marked in
3648 LIFE and update LIFE according to uses of NODE. */
3649
3650 static void
3651 vect_bb_slp_scalar_cost (vec_info *vinfo,
3652 slp_tree node, vec<bool, va_heap> *life,
3653 stmt_vector_for_cost *cost_vec,
3654 hash_set<slp_tree> &visited)
3655 {
3656 unsigned i;
3657 stmt_vec_info stmt_info;
3658 slp_tree child;
3659
3660 if (visited.add (node))
3661 return;
3662
3663 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3664 {
3665 ssa_op_iter op_iter;
3666 def_operand_p def_p;
3667
3668 if ((*life)[i])
3669 continue;
3670
3671 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3672 gimple *orig_stmt = orig_stmt_info->stmt;
3673
3674 /* If there is a non-vectorized use of the defs then the scalar
3675 stmt is kept live, in which case we do not account it or any
3676 required defs in the SLP children in the scalar cost. This
3677 way we make the vectorization more costly when compared to
3678 the scalar cost. */
3679 if (!STMT_VINFO_LIVE_P (stmt_info))
3680 {
3681 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3682 {
3683 imm_use_iterator use_iter;
3684 gimple *use_stmt;
3685 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3686 if (!is_gimple_debug (use_stmt))
3687 {
3688 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3689 if (!use_stmt_info
3690 || !PURE_SLP_STMT
3691 (vect_stmt_to_vectorize (use_stmt_info)))
3692 {
3693 (*life)[i] = true;
3694 BREAK_FROM_IMM_USE_STMT (use_iter);
3695 }
3696 }
3697 }
3698 if ((*life)[i])
3699 continue;
3700 }
3701
3702 /* Count scalar stmts only once. */
3703 if (gimple_visited_p (orig_stmt))
3704 continue;
3705 gimple_set_visited (orig_stmt, true);
3706
3707 vect_cost_for_stmt kind;
3708 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3709 {
3710 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3711 kind = scalar_load;
3712 else
3713 kind = scalar_store;
3714 }
3715 else if (vect_nop_conversion_p (orig_stmt_info))
3716 continue;
3717 else
3718 kind = scalar_stmt;
3719 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3720 }
3721
3722 auto_vec<bool, 20> subtree_life;
3723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3724 {
3725 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3726 {
3727 /* Do not directly pass LIFE to the recursive call, copy it to
3728 confine changes in the callee to the current child/subtree. */
3729 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3730 {
3731 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3732 for (unsigned j = 0;
3733 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3734 {
3735 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3736 if (perm.first == i)
3737 subtree_life[perm.second] = (*life)[j];
3738 }
3739 }
3740 else
3741 {
3742 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3743 subtree_life.safe_splice (*life);
3744 }
3745 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3746 visited);
3747 subtree_life.truncate (0);
3748 }
3749 }
3750 }
3751
3752 /* Check if vectorization of the basic block is profitable for the
3753 subgraph denoted by SLP_INSTANCES. */
3754
3755 static bool
3756 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3757 vec<slp_instance> slp_instances)
3758 {
3759 slp_instance instance;
3760 int i;
3761 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3762 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3763
3764 void *vect_target_cost_data = init_cost (NULL);
3765
3766 /* Calculate scalar cost and sum the cost for the vector stmts
3767 previously collected. */
3768 stmt_vector_for_cost scalar_costs;
3769 scalar_costs.create (0);
3770 hash_set<slp_tree> visited;
3771 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3772 {
3773 auto_vec<bool, 20> life;
3774 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3775 true);
3776 vect_bb_slp_scalar_cost (bb_vinfo,
3777 SLP_INSTANCE_TREE (instance),
3778 &life, &scalar_costs, visited);
3779 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3780 instance->cost_vec.release ();
3781 }
3782 /* Unset visited flag. */
3783 stmt_info_for_cost *si;
3784 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3785 gimple_set_visited (si->stmt_info->stmt, false);
3786
3787 void *scalar_target_cost_data = init_cost (NULL);
3788 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3789 scalar_costs.release ();
3790 unsigned dummy;
3791 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3792 destroy_cost_data (scalar_target_cost_data);
3793
3794 /* Complete the target-specific vector cost calculation. */
3795 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3796 &vec_inside_cost, &vec_epilogue_cost);
3797 destroy_cost_data (vect_target_cost_data);
3798
3799 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3800
3801 if (dump_enabled_p ())
3802 {
3803 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3804 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3805 vec_inside_cost);
3806 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3807 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3808 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3809 }
3810
3811 /* Vectorization is profitable if its cost is less than the cost of the
3812 scalar version. Note that we err on the vector side for equal cost
3813 because the cost estimate is otherwise quite pessimistic (constant
3814 uses are free on the scalar side but cost a load on the vector side
3815 for example). */
3816 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3817 return false;
3818
3819 return true;
3820 }
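/* For example: with vec_prologue_cost 1, vec_epilogue_cost 0 and
   vec_inside_cost 3, the vector cost is 1 + 0 + 3 = 4.  A subgraph with
   scalar_cost 4 is still vectorized (ties go to the vector side), while
   one with scalar_cost 3 is rejected since 4 > 3.  */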
3821
3822 /* Find any vectorizable constructors and add them to the grouped_store
3823 array. */
3824
3825 static void
3826 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3827 {
3828 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
3829 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
3830 !gsi_end_p (gsi); gsi_next (&gsi))
3831 {
3832 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
3833 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3834 continue;
3835
3836 tree rhs = gimple_assign_rhs1 (assign);
3837 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3838 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3839 CONSTRUCTOR_NELTS (rhs))
3840 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3841 || uniform_vector_p (rhs))
3842 continue;
3843
3844 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3845 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3846 }
3847 }
3848
3849 /* Check if the region described by BB_VINFO can be vectorized, returning
3850 true if so. When returning false, set FATAL to true if the same failure
3851 would prevent vectorization at other vector sizes, false if it is still
3852 worth trying other sizes. N_STMTS is the number of statements in the
3853 region. */
3854
3855 static bool
3856 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
3857 vec<int> *dataref_groups)
3858 {
3859 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3860
3861 slp_instance instance;
3862 int i;
3863 poly_uint64 min_vf = 2;
3864
3865 /* The first group of checks is independent of the vector size. */
3866 fatal = true;
3867
3868 /* Analyze the data references. */
3869
3870 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3871 {
3872 if (dump_enabled_p ())
3873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3874 "not vectorized: unhandled data-ref in basic "
3875 "block.\n");
3876 return false;
3877 }
3878
3879 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
3880 {
3881 if (dump_enabled_p ())
3882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3883 "not vectorized: unhandled data access in "
3884 "basic block.\n");
3885 return false;
3886 }
3887
3888 vect_slp_check_for_constructors (bb_vinfo);
3889
3890 /* If there are no grouped stores and no constructors in the region,
3891 there is no need to continue with pattern recog as vect_analyze_slp
3892 will fail anyway. */
3893 if (bb_vinfo->grouped_stores.is_empty ())
3894 {
3895 if (dump_enabled_p ())
3896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3897 "not vectorized: no grouped stores in "
3898 "basic block.\n");
3899 return false;
3900 }
3901
3902 /* The rest of the analysis below depends on the vector size in some way. */
3903 fatal = false;
3904
3905 vect_pattern_recog (bb_vinfo);
3906
3907 /* Check the SLP opportunities in the basic block, analyze and build SLP
3908 trees. */
3909 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3910 {
3911 if (dump_enabled_p ())
3912 {
3913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3914 "Failed to SLP the basic block.\n");
3915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3916 "not vectorized: failed to find SLP opportunities "
3917 "in basic block.\n");
3918 }
3919 return false;
3920 }
3921
3922 /* Optimize permutations. */
3923 vect_optimize_slp (bb_vinfo);
3924
3925 vect_record_base_alignments (bb_vinfo);
3926
3927 /* Analyze and verify the alignment of data references and the
3928 dependence in the SLP instances. */
3929 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3930 {
3931 vect_location = instance->location ();
3932 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
3933 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3934 {
3935 slp_tree node = SLP_INSTANCE_TREE (instance);
3936 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3937 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_NOTE, vect_location,
3939 "removing SLP instance operations starting from: %G",
3940 stmt_info->stmt);
3941 vect_free_slp_instance (instance);
3942 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3943 continue;
3944 }
3945
3946 /* Mark all the statements that we want to vectorize as pure SLP and
3947 relevant. */
3948 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3949 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3950 if (SLP_INSTANCE_ROOT_STMT (instance))
3951 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3952
3953 i++;
3954 }
3955 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3956 return false;
3957
3958 if (!vect_slp_analyze_operations (bb_vinfo))
3959 {
3960 if (dump_enabled_p ())
3961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3962 "not vectorized: bad operation in basic block.\n");
3963 return false;
3964 }
3965
3966 vect_bb_partition_graph (bb_vinfo);
3967
3968 return true;
3969 }
3970
3971 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
3972 basic blocks in BBS, returning true on success.
3973 The region has N_STMTS statements and the datarefs given by DATAREFS. */
3974
3975 static bool
3976 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
3977 vec<int> *dataref_groups, unsigned int n_stmts)
3978 {
3979 bb_vec_info bb_vinfo;
3980 auto_vector_modes vector_modes;
3981
3982 /* Autodetect first vector size we try. */
3983 machine_mode next_vector_mode = VOIDmode;
3984 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3985 unsigned int mode_i = 0;
3986
3987 vec_info_shared shared;
3988
3989 machine_mode autodetected_vector_mode = VOIDmode;
3990 while (1)
3991 {
3992 bool vectorized = false;
3993 bool fatal = false;
3994 bb_vinfo = new _bb_vec_info (bbs, &shared);
3995
3996 bool first_time_p = shared.datarefs.is_empty ();
3997 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3998 if (first_time_p)
3999 bb_vinfo->shared->save_datarefs ();
4000 else
4001 bb_vinfo->shared->check_datarefs ();
4002 bb_vinfo->vector_mode = next_vector_mode;
4003
4004 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4005 && dbg_cnt (vect_slp))
4006 {
4007 if (dump_enabled_p ())
4008 {
4009 dump_printf_loc (MSG_NOTE, vect_location,
4010 "***** Analysis succeeded with vector mode"
4011 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4012 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4013 }
4014
4015 bb_vinfo->shared->check_datarefs ();
4016
4017 unsigned i;
4018 slp_instance instance;
4019 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4020 {
4021 if (instance->subgraph_entries.is_empty ())
4022 continue;
4023
4024 vect_location = instance->location ();
4025 if (!unlimited_cost_model (NULL)
4026 && !vect_bb_vectorization_profitable_p
4027 (bb_vinfo, instance->subgraph_entries))
4028 {
4029 if (dump_enabled_p ())
4030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4031 "not vectorized: vectorization is not "
4032 "profitable.\n");
4033 continue;
4034 }
4035
4036 if (!vectorized && dump_enabled_p ())
4037 dump_printf_loc (MSG_NOTE, vect_location,
4038 "Basic block will be vectorized "
4039 "using SLP\n");
4040 vectorized = true;
4041
4042 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4043
4044 unsigned HOST_WIDE_INT bytes;
4045 if (dump_enabled_p ())
4046 {
4047 if (GET_MODE_SIZE
4048 (bb_vinfo->vector_mode).is_constant (&bytes))
4049 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4050 "basic block part vectorized using %wu "
4051 "byte vectors\n", bytes);
4052 else
4053 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4054 "basic block part vectorized using "
4055 "variable length vectors\n");
4056 }
4057 }
4058 }
4059 else
4060 {
4061 if (dump_enabled_p ())
4062 dump_printf_loc (MSG_NOTE, vect_location,
4063 "***** Analysis failed with vector mode %s\n",
4064 GET_MODE_NAME (bb_vinfo->vector_mode));
4065 }
4066
4067 if (mode_i == 0)
4068 autodetected_vector_mode = bb_vinfo->vector_mode;
4069
4070 if (!fatal)
4071 while (mode_i < vector_modes.length ()
4072 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4073 {
4074 if (dump_enabled_p ())
4075 dump_printf_loc (MSG_NOTE, vect_location,
4076 "***** The result for vector mode %s would"
4077 " be the same\n",
4078 GET_MODE_NAME (vector_modes[mode_i]));
4079 mode_i += 1;
4080 }
4081
4082 delete bb_vinfo;
4083
4084 if (mode_i < vector_modes.length ()
4085 && VECTOR_MODE_P (autodetected_vector_mode)
4086 && (related_vector_mode (vector_modes[mode_i],
4087 GET_MODE_INNER (autodetected_vector_mode))
4088 == autodetected_vector_mode)
4089 && (related_vector_mode (autodetected_vector_mode,
4090 GET_MODE_INNER (vector_modes[mode_i]))
4091 == vector_modes[mode_i]))
4092 {
4093 if (dump_enabled_p ())
4094 dump_printf_loc (MSG_NOTE, vect_location,
4095 "***** Skipping vector mode %s, which would"
4096 " repeat the analysis for %s\n",
4097 GET_MODE_NAME (vector_modes[mode_i]),
4098 GET_MODE_NAME (autodetected_vector_mode));
4099 mode_i += 1;
4100 }
4101
4102 if (vectorized
4103 || mode_i == vector_modes.length ()
4104 || autodetected_vector_mode == VOIDmode
4105 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4106 vector sizes will fail do not bother iterating. */
4107 || fatal)
4108 return vectorized;
4109
4110 /* Try the next biggest vector size. */
4111 next_vector_mode = vector_modes[mode_i++];
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_NOTE, vect_location,
4114 "***** Re-trying analysis with vector mode %s\n",
4115 GET_MODE_NAME (next_vector_mode));
4116 }
4117 }
4118
4119
4120 /* Main entry for the BB vectorizer. Analyze and transform BBS, returning
4121 true if anything in the basic blocks was vectorized. */
4122
4123 static bool
4124 vect_slp_bbs (vec<basic_block> bbs)
4125 {
4126 vec<data_reference_p> datarefs = vNULL;
4127 auto_vec<int> dataref_groups;
4128 int insns = 0;
4129 int current_group = 0;
4130
4131 for (unsigned i = 0; i < bbs.length (); i++)
4132 {
4133 basic_block bb = bbs[i];
4134 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4135 gsi_next (&gsi))
4136 {
4137 gimple *stmt = gsi_stmt (gsi);
4138 if (is_gimple_debug (stmt))
4139 continue;
4140
4141 insns++;
4142
4143 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4144 vect_location = stmt;
4145
4146 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4147 &dataref_groups, current_group))
4148 ++current_group;
4149
4150 if (insns > param_slp_max_insns_in_bb)
4151 {
4152 if (dump_enabled_p ())
4153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4154 "not vectorized: too many instructions in "
4155 "region.\n");
4156 }
4157 }
4158 }
4159
4160 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4161 }
4162
4163 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
4164 true if anything in the basic block was vectorized. */
4165
4166 bool
4167 vect_slp_bb (basic_block bb)
4168 {
4169 auto_vec<basic_block> bbs;
4170 bbs.safe_push (bb);
4171 return vect_slp_bbs (bbs);
4172 }
4173
4174 /* Main entry for the BB vectorizer. Analyze and transform all basic
4175 blocks in FUN, returning true if anything was vectorized. */
4176
4177 bool
4178 vect_slp_function (function *fun)
4179 {
4180 bool r = false;
4181 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4182 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4183
4184 /* For the moment split the function into pieces to avoid making
4185 the iteration on the vector mode moot. Split at points we know
4186 we do not handle well, which are CFG merges (SLP discovery doesn't
4187 handle non-loop-header PHIs) and loop exits. Since pattern
4188 recog requires reverse iteration to visit uses before defs,
4189 simply chop the RPO into pieces. */
4190 auto_vec<basic_block> bbs;
4191 for (unsigned i = 0; i < n; i++)
4192 {
4193 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4194
4195 /* Split when a basic block has multiple predecessors or when the
4196 edge into it exits a loop (because of implementation issues with
4197 respect to placement of CTORs for externals). */
4198 bool split = false;
4199 edge e;
4200 if (!single_pred_p (bb)
4201 || ((e = single_pred_edge (bb)),
4202 loop_exit_edge_p (e->src->loop_father, e)))
4203 split = true;
4204 /* Split when a BB is not dominated by the first block. */
4205 else if (!bbs.is_empty ()
4206 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4207 split = true;
4208
4209 if (split && !bbs.is_empty ())
4210 {
4211 r |= vect_slp_bbs (bbs);
4212 bbs.truncate (0);
4213 bbs.quick_push (bb);
4214 }
4215 else
4216 bbs.safe_push (bb);
4217 }
4218
4219 if (!bbs.is_empty ())
4220 r |= vect_slp_bbs (bbs);
4221
4222 free (rpo);
4223
4224 return r;
4225 }
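/* For example: for a diamond bb1 -> { bb2, bb3 } -> bb4 the RPO is
   bb1, bb2, bb3, bb4.  bb2 and bb3 each have the single predecessor bb1
   and are dominated by bb1, so they join its region, while the CFG merge
   bb4 has two predecessors and starts a new one; vect_slp_bbs is thus
   called on { bb1, bb2, bb3 } and separately on { bb4 }.  */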
4226
4227 /* Build a variable-length vector in which the elements in ELTS are repeated
4228 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4229 RESULTS and add any new instructions to SEQ.
4230
4231 The approach we use is:
4232
4233 (1) Find a vector mode VM with integer elements of mode IM.
4234
4235 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4236 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4237 from small vectors to IM.
4238
4239 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4240
4241 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4242 correct byte contents.
4243
4244 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4245
4246 We try to find the largest IM for which this sequence works, in order
4247 to cut down on the number of interleaves. */
4248
4249 void
4250 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4251 vec<tree> elts, unsigned int nresults,
4252 vec<tree> &results)
4253 {
4254 unsigned int nelts = elts.length ();
4255 tree element_type = TREE_TYPE (vector_type);
4256
4257 /* (1) Find a vector mode VM with integer elements of mode IM. */
4258 unsigned int nvectors = 1;
4259 tree new_vector_type;
4260 tree permutes[2];
4261 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4262 &nvectors, &new_vector_type,
4263 permutes))
4264 gcc_unreachable ();
4265
4266 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4267 unsigned int partial_nelts = nelts / nvectors;
4268 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4269
4270 tree_vector_builder partial_elts;
4271 auto_vec<tree, 32> pieces (nvectors * 2);
4272 pieces.quick_grow (nvectors * 2);
4273 for (unsigned int i = 0; i < nvectors; ++i)
4274 {
4275 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4276 ELTS' has mode IM. */
4277 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4278 for (unsigned int j = 0; j < partial_nelts; ++j)
4279 partial_elts.quick_push (elts[i * partial_nelts + j]);
4280 tree t = gimple_build_vector (seq, &partial_elts);
4281 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4282 TREE_TYPE (new_vector_type), t);
4283
4284 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4285 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4286 }
4287
4288 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4289 correct byte contents.
4290
4291 We need to repeat the following operation log2(nvectors) times:
4292
4293 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4294 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4295
4296 However, if each input repeats every N elements and the VF is
4297 a multiple of N * 2, the HI result is the same as the LO. */
4298 unsigned int in_start = 0;
4299 unsigned int out_start = nvectors;
4300 unsigned int hi_start = nvectors / 2;
4301 /* A bound on the number of outputs needed to produce NRESULTS results
4302 in the final iteration. */
4303 unsigned int noutputs_bound = nvectors * nresults;
4304 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4305 {
4306 noutputs_bound /= 2;
4307 unsigned int limit = MIN (noutputs_bound, nvectors);
4308 for (unsigned int i = 0; i < limit; ++i)
4309 {
4310 if ((i & 1) != 0
4311 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4312 2 * in_repeat))
4313 {
4314 pieces[out_start + i] = pieces[out_start + i - 1];
4315 continue;
4316 }
4317
4318 tree output = make_ssa_name (new_vector_type);
4319 tree input1 = pieces[in_start + (i / 2)];
4320 tree input2 = pieces[in_start + (i / 2) + hi_start];
4321 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4322 input1, input2,
4323 permutes[i & 1]);
4324 gimple_seq_add_stmt (seq, stmt);
4325 pieces[out_start + i] = output;
4326 }
4327 std::swap (in_start, out_start);
4328 }
4329
4330 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4331 results.reserve (nresults);
4332 for (unsigned int i = 0; i < nresults; ++i)
4333 if (i < nvectors)
4334 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4335 pieces[in_start + i]));
4336 else
4337 results.quick_push (results[i - nvectors]);
4338 }
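/* Worked example (hypothetical modes): for ELTS = { a, b, c, d } of
   32-bit ints with IM = DImode we get NVECTORS = 2.  Step (2) forms the
   DImode values ab and cd, step (3) duplicates them into { ab, ab, ... }
   and { cd, cd, ... }, and a single interleave in step (4) produces
   { ab, cd, ab, cd, ... }, i.e. a, b, c, d repeating, which step (5)
   view-converts to VECTOR_TYPE.  */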
4339
4340
4341 /* For constant and loop invariant defs in OP_NODE this function creates
4342 vector defs that will be used in the vectorized stmts and stores them
4343 to SLP_TREE_VEC_DEFS of OP_NODE. */
4344
4345 static void
4346 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4347 {
4348 unsigned HOST_WIDE_INT nunits;
4349 tree vec_cst;
4350 unsigned j, number_of_places_left_in_vector;
4351 tree vector_type;
4352 tree vop;
4353 int group_size = op_node->ops.length ();
4354 unsigned int vec_num, i;
4355 unsigned number_of_copies = 1;
4356 bool constant_p;
4357 gimple_seq ctor_seq = NULL;
4358 auto_vec<tree, 16> permute_results;
4359
4360 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4361 vector_type = SLP_TREE_VECTYPE (op_node);
4362
4363 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4364 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4365 auto_vec<tree> voprnds (number_of_vectors);
4366
4367 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4368 created vectors. It is greater than 1 if unrolling is performed.
4369
4370 For example, we have two scalar operands, s1 and s2 (e.g., group of
4371 strided accesses of size two), while NUNITS is four (i.e., four scalars
4372 of this type can be packed in a vector). The output vector will contain
4373 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4374 will be 2).
4375
4376 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4377 containing the operands.
4378
4379 For example, NUNITS is four as before, and the group size is 8
4380 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4381 {s5, s6, s7, s8}. */
4382
4383 /* When using duplicate_and_interleave, we just need one element for
4384 each scalar statement. */
4385 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4386 nunits = group_size;
4387
4388 number_of_copies = nunits * number_of_vectors / group_size;
4389
4390 number_of_places_left_in_vector = nunits;
4391 constant_p = true;
4392 tree_vector_builder elts (vector_type, nunits, 1);
4393 elts.quick_grow (nunits);
4394 stmt_vec_info insert_after = NULL;
4395 for (j = 0; j < number_of_copies; j++)
4396 {
4397 tree op;
4398 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4399 {
4400 /* Create 'vect_ = {op0,op1,...,opn}'. */
4401 number_of_places_left_in_vector--;
4402 tree orig_op = op;
4403 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4404 {
4405 if (CONSTANT_CLASS_P (op))
4406 {
4407 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4408 {
4409 /* Can't use VIEW_CONVERT_EXPR for booleans because
4410 of possibly different sizes of scalar value and
4411 vector element. */
4412 if (integer_zerop (op))
4413 op = build_int_cst (TREE_TYPE (vector_type), 0);
4414 else if (integer_onep (op))
4415 op = build_all_ones_cst (TREE_TYPE (vector_type));
4416 else
4417 gcc_unreachable ();
4418 }
4419 else
4420 op = fold_unary (VIEW_CONVERT_EXPR,
4421 TREE_TYPE (vector_type), op);
4422 gcc_assert (op && CONSTANT_CLASS_P (op));
4423 }
4424 else
4425 {
4426 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4427 gimple *init_stmt;
4428 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4429 {
4430 tree true_val
4431 = build_all_ones_cst (TREE_TYPE (vector_type));
4432 tree false_val
4433 = build_zero_cst (TREE_TYPE (vector_type));
4434 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4435 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4436 op, true_val,
4437 false_val);
4438 }
4439 else
4440 {
4441 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4442 op);
4443 init_stmt
4444 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4445 op);
4446 }
4447 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4448 op = new_temp;
4449 }
4450 }
4451 elts[number_of_places_left_in_vector] = op;
4452 if (!CONSTANT_CLASS_P (op))
4453 constant_p = false;
4454 /* For BB vectorization we have to compute an insert location
4455 when a def is inside the analyzed region since we cannot
4456 simply insert at the BB start in this case. */
4457 stmt_vec_info opdef;
4458 if (TREE_CODE (orig_op) == SSA_NAME
4459 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4460 && is_a <bb_vec_info> (vinfo)
4461 && (opdef = vinfo->lookup_def (orig_op)))
4462 {
4463 if (!insert_after)
4464 insert_after = opdef;
4465 else
4466 insert_after = get_later_stmt (insert_after, opdef);
4467 }
4468
4469 if (number_of_places_left_in_vector == 0)
4470 {
4471 if (constant_p
4472 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4473 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4474 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4475 else
4476 {
4477 if (permute_results.is_empty ())
4478 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4479 elts, number_of_vectors,
4480 permute_results);
4481 vec_cst = permute_results[number_of_vectors - j - 1];
4482 }
4483 if (!gimple_seq_empty_p (ctor_seq))
4484 {
4485 if (insert_after)
4486 {
4487 gimple_stmt_iterator gsi;
4488 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4489 {
4490 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4491 gsi_insert_seq_before (&gsi, ctor_seq,
4492 GSI_CONTINUE_LINKING);
4493 }
4494 else if (!stmt_ends_bb_p (insert_after->stmt))
4495 {
4496 gsi = gsi_for_stmt (insert_after->stmt);
4497 gsi_insert_seq_after (&gsi, ctor_seq,
4498 GSI_CONTINUE_LINKING);
4499 }
4500 else
4501 {
4502 /* When we want to insert after a def whose
4503 defining stmt throws, insert on the fallthru
4504 edge. */
4505 edge e = find_fallthru_edge
4506 (gimple_bb (insert_after->stmt)->succs);
4507 basic_block new_bb
4508 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4509 gcc_assert (!new_bb);
4510 }
4511 }
4512 else
4513 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4514 ctor_seq = NULL;
4515 }
4516 voprnds.quick_push (vec_cst);
4517 insert_after = NULL;
4518 number_of_places_left_in_vector = nunits;
4519 constant_p = true;
4520 elts.new_vector (vector_type, nunits, 1);
4521 elts.quick_grow (nunits);
4522 }
4523 }
4524 }
4525
4526 /* Since the vectors are created in the reverse order, we should invert
4527 them. */
4528 vec_num = voprnds.length ();
4529 for (j = vec_num; j != 0; j--)
4530 {
4531 vop = voprnds[j - 1];
4532 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4533 }
4534
4535 /* If VF is greater than the unrolling factor needed for the SLP
4536 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4537 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4538 to replicate the vectors. */
4539 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4540 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4541 i++)
4542 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4543 }
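/* For example: with GROUP_SIZE 2, ops { s1, s2 }, NUNITS 4 and
   NUMBER_OF_VECTORS 1, NUMBER_OF_COPIES is 4 * 1 / 2 = 2.  The inner
   loop above fills ELTS from the last position downwards, so after the
   two copies the vector built is { s1, s2, s1, s2 } as described in the
   function comment.  */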
4544
4545 /* Get the Ith vectorized definition from SLP_NODE. */
4546
4547 tree
4548 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4549 {
4550 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4551 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4552 else
4553 return SLP_TREE_VEC_DEFS (slp_node)[i];
4554 }
4555
4556 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4557
4558 void
4559 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4560 {
4561 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4562 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4563 {
4564 unsigned j;
4565 gimple *vec_def_stmt;
4566 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4567 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4568 }
4569 else
4570 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4571 }
4572
4573 /* Get N vectorized definitions for SLP_NODE. */
4574
4575 void
4576 vect_get_slp_defs (vec_info *,
4577 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4578 {
4579 if (n == -1U)
4580 n = SLP_TREE_CHILDREN (slp_node).length ();
4581
4582 for (unsigned i = 0; i < n; ++i)
4583 {
4584 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4585 vec<tree> vec_defs = vNULL;
4586 vect_get_slp_defs (child, &vec_defs);
4587 vec_oprnds->quick_push (vec_defs);
4588 }
4589 }
4590
4591 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4592 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4593 permute statements for the SLP node NODE. */
4594
4595 bool
4596 vect_transform_slp_perm_load (vec_info *vinfo,
4597 slp_tree node, vec<tree> dr_chain,
4598 gimple_stmt_iterator *gsi, poly_uint64 vf,
4599 bool analyze_only, unsigned *n_perms)
4600 {
4601 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4602 int vec_index = 0;
4603 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4604 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4605 unsigned int mask_element;
4606 machine_mode mode;
4607
4608 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4609 return false;
4610
4611 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4612
4613 mode = TYPE_MODE (vectype);
4614 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4615
4616 /* Initialize the vect stmts of NODE to properly insert the generated
4617 stmts later. */
4618 if (! analyze_only)
4619 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4620 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4621 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4622
4623 /* Generate permutation masks for every NODE. Number of masks for each NODE
4624 is equal to GROUP_SIZE.
4625 E.g., we have a group of three nodes with three loads from the same
4626 location in each node, and the vector size is 4. I.e., we have an
4627 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4628 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4629 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4630 ...
4631
4632 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4633 The last mask is illegal since we assume two operands for the permute
4634 operation, and the mask element values can't index outside those two
4635 vectors. Hence, the last mask must be converted into {2,5,5,5}.
4636 For the first two permutations we need the first and the second input
4637 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4638 we need the second and the third vectors: {b1,c1,a2,b2} and
4639 {c2,a3,b3,c3}. */
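/* Concretely, for the a's above: DR_GROUP_SIZE is 3, the load
   permutation is { 0, 0, 0 } and const_nunits is 4, so the loop below
   computes the element indices 0,0,0,3 3,3,6,6 6,9,9,9.  The third
   group reads vectors 1 and 2, and relative to those two inputs the
   mask becomes the legal { 2, 5, 5, 5 }.  */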
4640
4641 int vect_stmts_counter = 0;
4642 unsigned int index = 0;
4643 int first_vec_index = -1;
4644 int second_vec_index = -1;
4645 bool noop_p = true;
4646 *n_perms = 0;
4647
4648 vec_perm_builder mask;
4649 unsigned int nelts_to_build;
4650 unsigned int nvectors_per_build;
4651 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4652 && multiple_p (nunits, group_size));
4653 if (repeating_p)
4654 {
4655 /* A single vector contains a whole number of copies of the node, so:
4656 (a) all permutes can use the same mask; and
4657 (b) the permutes only need a single vector input. */
4658 mask.new_vector (nunits, group_size, 3);
4659 nelts_to_build = mask.encoded_nelts ();
4660 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4661 }
4662 else
4663 {
4664 /* We need to construct a separate mask for each vector statement. */
4665 unsigned HOST_WIDE_INT const_nunits, const_vf;
4666 if (!nunits.is_constant (&const_nunits)
4667 || !vf.is_constant (&const_vf))
4668 return false;
4669 mask.new_vector (const_nunits, const_nunits, 1);
4670 nelts_to_build = const_vf * group_size;
4671 nvectors_per_build = 1;
4672 }
4673
4674 unsigned int count = mask.encoded_nelts ();
4675 mask.quick_grow (count);
4676 vec_perm_indices indices;
4677
4678 for (unsigned int j = 0; j < nelts_to_build; j++)
4679 {
4680 unsigned int iter_num = j / group_size;
4681 unsigned int stmt_num = j % group_size;
4682 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4683 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4684 if (repeating_p)
4685 {
4686 first_vec_index = 0;
4687 mask_element = i;
4688 }
4689 else
4690 {
4691 /* Enforced before the loop when !repeating_p. */
4692 unsigned int const_nunits = nunits.to_constant ();
4693 vec_index = i / const_nunits;
4694 mask_element = i % const_nunits;
4695 if (vec_index == first_vec_index
4696 || first_vec_index == -1)
4697 {
4698 first_vec_index = vec_index;
4699 }
4700 else if (vec_index == second_vec_index
4701 || second_vec_index == -1)
4702 {
4703 second_vec_index = vec_index;
4704 mask_element += const_nunits;
4705 }
4706 else
4707 {
4708 if (dump_enabled_p ())
4709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4710 "permutation requires at "
4711 "least three vectors %G",
4712 stmt_info->stmt);
4713 gcc_assert (analyze_only);
4714 return false;
4715 }
4716
4717 gcc_assert (mask_element < 2 * const_nunits);
4718 }
4719
4720 if (mask_element != index)
4721 noop_p = false;
4722 mask[index++] = mask_element;
4723
4724 if (index == count && !noop_p)
4725 {
4726 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4727 if (!can_vec_perm_const_p (mode, indices))
4728 {
4729 if (dump_enabled_p ())
4730 {
4731 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4732 vect_location,
4733 "unsupported vect permute { ");
4734 for (i = 0; i < count; ++i)
4735 {
4736 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4737 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4738 }
4739 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4740 }
4741 gcc_assert (analyze_only);
4742 return false;
4743 }
4744
4745 ++*n_perms;
4746 }
4747
4748 if (index == count)
4749 {
4750 if (!analyze_only)
4751 {
4752 tree mask_vec = NULL_TREE;
4753
4754 if (! noop_p)
4755 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4756
4757 if (second_vec_index == -1)
4758 second_vec_index = first_vec_index;
4759
4760 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4761 {
4762 /* Generate the permute statement if necessary. */
4763 tree first_vec = dr_chain[first_vec_index + ri];
4764 tree second_vec = dr_chain[second_vec_index + ri];
4765 gimple *perm_stmt;
4766 if (! noop_p)
4767 {
4768 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4769 tree perm_dest
4770 = vect_create_destination_var (gimple_assign_lhs (stmt),
4771 vectype);
4772 perm_dest = make_ssa_name (perm_dest);
4773 perm_stmt
4774 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4775 first_vec, second_vec,
4776 mask_vec);
4777 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4778 gsi);
4779 }
4780 else
4781 /* If mask was NULL_TREE generate the requested
4782 identity transform. */
4783 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4784
4785 /* Store the vector statement in NODE. */
4786 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4787 }
4788 }
4789
4790 index = 0;
4791 first_vec_index = -1;
4792 second_vec_index = -1;
4793 noop_p = true;
4794 }
4795 }
4796
4797 return true;
4798 }
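/* Illustrative sketch (not part of GCC): the index arithmetic above for
   one mask of a strided load, in plain C with hypothetical names.
   Returns the first input vector index or -1 if a third vector would
   be needed.  */
#if 0
static int
build_load_perm_mask (unsigned group_size, unsigned dr_group_size,
                      const unsigned *load_perm, unsigned nunits,
                      unsigned start, unsigned *mask)
{
  int first_vec = -1, second_vec = -1;
  for (unsigned k = 0; k < nunits; ++k)
    {
      unsigned j = start + k;
      unsigned i = (j / group_size) * dr_group_size
                   + load_perm[j % group_size];
      int vec = i / nunits;
      unsigned elt = i % nunits;
      if (first_vec == -1 || vec == first_vec)
        first_vec = vec;
      else if (second_vec == -1 || vec == second_vec)
        {
          /* Elements of the second input are offset by NUNITS.  */
          second_vec = vec;
          elt += nunits;
        }
      else
        return -1;  /* Permutation requires at least three vectors.  */
      mask[k] = elt;
    }
  return first_vec;
}
#endif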
4799
4800
4801 /* Vectorize the SLP permutations in NODE as specified
4802 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4803 child number and lane number.
4804 Interleaving of two two-lane two-child SLP subtrees (not supported):
4805 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4806 A blend of two four-lane two-child SLP subtrees:
4807 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4808 Highpart of a four-lane one-child SLP subtree (not supported):
4809 [ { 0, 2 }, { 0, 3 } ]
4810 Currently only a subset of these is supported by the code generation below. */
4811
4812 static bool
4813 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4814 slp_tree node, stmt_vector_for_cost *cost_vec)
4815 {
4816 tree vectype = SLP_TREE_VECTYPE (node);
4817
4818 /* ??? We currently only support all same vector input and output types
4819 while the SLP IL should really do a concat + select and thus accept
4820 arbitrary mismatches. */
4821 slp_tree child;
4822 unsigned i;
4823 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4824 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4825 {
4826 if (dump_enabled_p ())
4827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4828 "Unsupported lane permutation\n");
4829 return false;
4830 }
4831
4832 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4833 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4834 if (dump_enabled_p ())
4835 {
4836 dump_printf_loc (MSG_NOTE, vect_location,
4837 "vectorizing permutation");
4838 for (unsigned i = 0; i < perm.length (); ++i)
4839 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4840 dump_printf (MSG_NOTE, "\n");
4841 }
4842
4843 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4844 if (!nunits.is_constant ())
4845 return false;
4846 unsigned HOST_WIDE_INT vf = 1;
4847 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4848 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4849 return false;
4850 unsigned olanes = vf * SLP_TREE_LANES (node);
4851 gcc_assert (multiple_p (olanes, nunits));
4852
4853 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4854 from the { SLP operand, scalar lane } permutation as recorded in the
4855 SLP node as intermediate step. This part should already work
4856 with SLP children with arbitrary number of lanes. */
4857 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4858 auto_vec<unsigned> active_lane;
4859 vperm.create (olanes);
4860 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
4861 for (unsigned i = 0; i < vf; ++i)
4862 {
4863 for (unsigned pi = 0; pi < perm.length (); ++pi)
4864 {
4865 std::pair<unsigned, unsigned> p = perm[pi];
4866 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4867 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4868 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4869 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4870 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4871 }
4872 /* Advance to the next group. */
4873 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4874 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4875 }
4876
4877 if (dump_enabled_p ())
4878 {
4879 dump_printf_loc (MSG_NOTE, vect_location, "as");
4880 for (unsigned i = 0; i < vperm.length (); ++i)
4881 {
4882 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4883 dump_printf (MSG_NOTE, ",");
4884 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4885 vperm[i].first.first, vperm[i].first.second,
4886 vperm[i].second);
4887 }
4888 dump_printf (MSG_NOTE, "\n");
4889 }
4890
4891 /* We can only handle two-vector permutes; everything else should
4892 be lowered on the SLP level. The following is closely inspired
4893 by vect_transform_slp_perm_load and is supposed to eventually
4894 replace it.
4895 ??? As an intermediate step do code-gen in the SLP tree representation
4896 somehow? */
4897 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4898 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4899 unsigned int const_nunits = nunits.to_constant ();
4900 unsigned int index = 0;
4901 unsigned int mask_element;
4902 vec_perm_builder mask;
4903 mask.new_vector (const_nunits, const_nunits, 1);
4904 unsigned int count = mask.encoded_nelts ();
4905 mask.quick_grow (count);
4906 vec_perm_indices indices;
4907 unsigned nperms = 0;
4908 for (unsigned i = 0; i < vperm.length (); ++i)
4909 {
4910 mask_element = vperm[i].second;
4911 if (first_vec.first == -1U
4912 || first_vec == vperm[i].first)
4913 first_vec = vperm[i].first;
4914 else if (second_vec.first == -1U
4915 || second_vec == vperm[i].first)
4916 {
4917 second_vec = vperm[i].first;
4918 mask_element += const_nunits;
4919 }
4920 else
4921 {
4922 if (dump_enabled_p ())
4923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4924 "permutation requires at "
4925 "least three vectors");
4926 gcc_assert (!gsi);
4927 return false;
4928 }
4929
4930 mask[index++] = mask_element;
4931
4932 if (index == count)
4933 {
4934 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4935 const_nunits);
4936 bool identity_p = indices.series_p (0, 1, 0, 1);
4937 if (!identity_p
4938 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4939 {
4940 if (dump_enabled_p ())
4941 {
4942 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4943 vect_location,
4944 "unsupported vect permute { ");
4945 for (i = 0; i < count; ++i)
4946 {
4947 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4948 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4949 }
4950 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4951 }
4952 gcc_assert (!gsi);
4953 return false;
4954 }
4955
4956 if (!identity_p)
4957 nperms++;
4958 if (gsi)
4959 {
4960 if (second_vec.first == -1U)
4961 second_vec = first_vec;
4962
4963 /* Generate the permute statement if necessary. */
4964 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4965 tree first_def
4966 = vect_get_slp_vect_def (first_node, first_vec.second);
4967 gassign *perm_stmt;
4968 tree perm_dest = make_ssa_name (vectype);
4969 if (!identity_p)
4970 {
4971 slp_tree second_node
4972 = SLP_TREE_CHILDREN (node)[second_vec.first];
4973 tree second_def
4974 = vect_get_slp_vect_def (second_node, second_vec.second);
4975 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4976 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4977 first_def, second_def,
4978 mask_vec);
4979 }
4980 else
4981 /* We need a copy here in case the def was external. */
4982 perm_stmt = gimple_build_assign (perm_dest, first_def);
4983 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4984 /* Store the vector statement in NODE. */
4985 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4986 }
4987
4988 index = 0;
4989 first_vec = std::make_pair (-1U, -1U);
4990 second_vec = std::make_pair (-1U, -1U);
4991 }
4992 }
4993
4994 if (!gsi)
4995 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4996
4997 return true;
4998 }
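/* For the blend example in the comment above, with VF 1 and four-lane
   children (e.g. V4SI), VPERM becomes { { { 0, 0 }, 0 }, { { 1, 0 }, 1 },
   { { 0, 0 }, 2 }, { { 1, 0 }, 3 } } and the generated two-input permute
   mask is { 0, 5, 2, 7 }, lanes of the second input being offset by
   const_nunits = 4.  */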
4999
5000 /* Vectorize SLP instance tree in postorder. */
5001
5002 static void
5003 vect_schedule_slp_instance (vec_info *vinfo,
5004 slp_tree node, slp_instance instance,
5005 hash_set<slp_tree> &visited)
5006 {
5007 gimple_stmt_iterator si;
5008 int i;
5009 slp_tree child;
5010
5011 /* See if we have already vectorized the node in the graph of the
5012 SLP instance. */
5013 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
5014 && SLP_TREE_VEC_STMTS (node).exists ())
5015 || SLP_TREE_VEC_DEFS (node).exists ())
5016 return;
5017
5018 /* Vectorize externals and constants. */
5019 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5020 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5021 {
5022 /* ??? vectorizable_shift can end up using a scalar operand which is
5023 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5024 node in this case. */
5025 if (!SLP_TREE_VECTYPE (node))
5026 return;
5027
5028 vect_create_constant_vectors (vinfo, node);
5029 return;
5030 }
5031
5032 /* ??? If we'd have a way to mark backedges that would be cheaper. */
5033 if (visited.add (node))
5034 return;
5035
5036 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5037 vect_schedule_slp_instance (vinfo, child, instance, visited);
5038
5039 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5040 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5041
5042 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5043 if (dump_enabled_p ())
5044 dump_printf_loc (MSG_NOTE, vect_location,
5045 "------>vectorizing SLP node starting from: %G",
5046 stmt_info->stmt);
5047
5048 if (STMT_VINFO_DATA_REF (stmt_info)
5049 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5050 {
5051 /* Vectorized loads go before the first scalar load to make it
5052 ready early; vectorized stores go before the last scalar
5053 stmt, which is where all uses are ready. */
5054 stmt_vec_info last_stmt_info = NULL;
5055 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5056 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5057 else /* DR_IS_WRITE */
5058 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5059 si = gsi_for_stmt (last_stmt_info->stmt);
5060 }
5061 else if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
5062 == cycle_phi_info_type)
5063 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
5064 == induc_vec_info_type))
5065 {
5066 /* For reduction and induction PHIs we do not use the
5067 insertion iterator. */
5068 si = gsi_none ();
5069 }
5070 else
5071 {
5072 /* Emit other stmts after the children's vectorized defs, which is
5073 the earliest position possible. */
5074 gimple *last_stmt = NULL;
5075 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5076 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5077 {
5078 /* For fold-left reductions we are retaining the scalar
5079 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5080 set so the representation isn't perfect. Resort to the
5081 last scalar def here. */
5082 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5083 {
5084 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5085 == cycle_phi_info_type);
5086 gphi *phi = as_a <gphi *>
5087 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5088 if (!last_stmt
5089 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5090 last_stmt = phi;
5091 }
5092 /* We are emitting all vectorized stmts in the same place, so
5093 the last one emitted is the latest def.
5094 ??? Unless we have a load permutation applied and that
5095 figures to re-use an earlier generated load. */
5096 unsigned j;
5097 gimple *vstmt;
5098 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5099 if (!last_stmt
5100 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5101 last_stmt = vstmt;
5102 }
5103 else if (!SLP_TREE_VECTYPE (child))
5104 {
5105 /* For externals that remain unvectorized we look at all their scalar defs. */
5106 unsigned j;
5107 tree def;
5108 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5109 if (TREE_CODE (def) == SSA_NAME
5110 && !SSA_NAME_IS_DEFAULT_DEF (def))
5111 {
5112 gimple *stmt = SSA_NAME_DEF_STMT (def);
5113 if (!last_stmt
5114 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5115 last_stmt = stmt;
5116 }
5117 }
5118 else
5119 {
5120 /* For externals we have to look at all defs since their
5121 insertion place is decided per vector. But beware
5122 of pre-existing vectors where we need to make sure
5123 we do not insert before the region boundary. */
5124 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5125 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5126 last_stmt = gsi_stmt (gsi_after_labels
5127 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5128 else
5129 {
5130 unsigned j;
5131 tree vdef;
5132 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5133 if (TREE_CODE (vdef) == SSA_NAME
5134 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5135 {
5136 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5137 if (!last_stmt
5138 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5139 last_stmt = vstmt;
5140 }
5141 }
5142 }
5143 /* This can happen when all children are pre-existing vectors or
5144 constants. */
5145 if (!last_stmt)
5146 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5147 if (is_a <gphi *> (last_stmt))
5148 si = gsi_after_labels (gimple_bb (last_stmt));
5149 else
5150 {
5151 si = gsi_for_stmt (last_stmt);
5152 gsi_next (&si);
5153 }
5154 }
5155
5156 bool done_p = false;
5157
5158 /* Handle purely internal nodes. */
5159 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5160 {
5161 /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
5162 be shared with different SLP nodes (but usually it's the same
5163 operation, apart from the case where the stmt is only there to denote
5164 the actual scalar lane defs ...). So do not call vect_transform_stmt
5165 but open-code it here (partly). */
5166 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5167 gcc_assert (done);
5168 done_p = true;
5169 }
5170 if (!done_p)
5171 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5172 }
5173
5174 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5175 For loop vectorization this is done in vectorizable_call, but for SLP
5176 it needs to be deferred until the end of vect_schedule_slp, because multiple
5177 SLP instances may refer to the same scalar stmt. */
5178
5179 static void
5180 vect_remove_slp_scalar_calls (vec_info *vinfo,
5181 slp_tree node, hash_set<slp_tree> &visited)
5182 {
5183 gimple *new_stmt;
5184 gimple_stmt_iterator gsi;
5185 int i;
5186 slp_tree child;
5187 tree lhs;
5188 stmt_vec_info stmt_info;
5189
5190 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5191 return;
5192
5193 if (visited.add (node))
5194 return;
5195
5196 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5197 vect_remove_slp_scalar_calls (vinfo, child, visited);
5198
5199 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5200 {
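      /* Only replace calls that are still in the IL and whose scalar
         uses are all vectorized, i.e. skip pattern stmts and hybrid
         SLP stmts.  */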
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
        continue;
      if (is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
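      /* The call itself is dead after vectorization; replace it with a
         dummy assignment of zero to its lhs so the SSA def stays valid
         until the remaining scalar uses are cleaned up.  */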
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}

/* Vectorize the instance root.  */

void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

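  /* If a single vector def covers the root, assign it to the root
     stmt's lhs directly, adding a VIEW_CONVERT_EXPR if the types do
     not agree.  */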
  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
    {
      gimple *child_stmt;
      int j;

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
        {
          tree vect_lhs = gimple_get_lhs (child_stmt);
          tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
          if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
                                          TREE_TYPE (vect_lhs)))
            vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
                               vect_lhs);
          rstmt = gimple_build_assign (root_lhs, vect_lhs);
          break;
        }
    }
  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
    {
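      /* With multiple vector defs build a CONSTRUCTOR of the defs in
         the type of the root stmt's rhs and assign that instead.  */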
      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      gimple *child_stmt;
      int j;
      vec<constructor_elt, va_gc> *v;
      vec_alloc (v, nelts);

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
        CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_get_lhs (child_stmt));
      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
      tree r_constructor = build_constructor (rtype, v);
      rstmt = gimple_build_assign (lhs, r_constructor);
    }

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
  gsi_replace (&rgsi, rstmt, true);
}

/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
{
  slp_instance instance;
  unsigned int i;

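  /* Nodes can be shared between SLP instances; VISITED ensures each
     node is code generated only once.  */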
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Vectorizing SLP tree:\n");
          if (SLP_INSTANCE_ROOT_STMT (instance))
            dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
                             SLP_INSTANCE_ROOT_STMT (instance)->stmt);
          vect_print_slp_graph (MSG_NOTE, vect_location,
                                SLP_INSTANCE_TREE (instance));
        }
      /* Schedule the tree of INSTANCE.  */
      vect_schedule_slp_instance (vinfo, node, instance, visited);

      if (SLP_INSTANCE_ROOT_STMT (instance))
        vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* For reductions set the latch values of the vectorized PHIs.  */
      if (instance->reduc_phis
          && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
                (instance->reduc_phis)) != FOLD_LEFT_REDUCTION
          && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
                (instance->reduc_phis)) != EXTRACT_LAST_REDUCTION)
        {
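          /* There is one vector PHI per vector def of the root node;
             set each latch value to the corresponding vector def.
             Fold-left reductions retain the scalar reduction PHI and
             thus have no vector latch values to set (and presumably
             the same holds for extract-last reductions).  */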
          slp_tree slp_node = root;
          slp_tree phi_node = instance->reduc_phis;
          gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
          edge e = loop_latch_edge (gimple_bb (phi)->loop_father);
          gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
                      == SLP_TREE_VEC_STMTS (slp_node).length ());
          for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
            add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]),
                         vect_get_slp_vect_def (slp_node, j),
                         e, gimple_phi_arg_location (phi, e->dest_idx));
        }

      /* Remove scalar call stmts.  Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ??? Why should this be necessary?  DCE should be able to
         remove the stmts itself.
         ??? For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses.  */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores.  The
         scalar stmts of a store instance are all stores, so we stop
         at the first stmt that is not a store, which also covers
         instances not rooted at stores.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
        {
          if (!STMT_VINFO_DATA_REF (store_info)
              || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
            break;

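          /* If the store was rewritten by a pattern, the original
             scalar stmt is recorded as the related stmt; remove that
             one rather than the pattern stmt.  */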
          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt.  */
          vinfo->remove_stmt (store_info);
        }
    }
}