tree-optimization/98235 - limit SLP discovery
[gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
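/* All SLP nodes are allocated from the pool below and additionally
   chained into a doubly-linked list headed by slp_first_node, so that
   vect_slp_fini can release any nodes that were not explicitly freed.  */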
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 static void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 delete node;
150 }
151
152 /* Return a location suitable for dumps related to the SLP instance.  */
153
154 dump_user_location_t
155 _slp_instance::location () const
156 {
157 if (root_stmt)
158 return root_stmt->stmt;
159 else
160 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
161 }
162
163
164 /* Free the memory allocated for the SLP instance. */
165
166 void
167 vect_free_slp_instance (slp_instance instance)
168 {
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
170 SLP_INSTANCE_LOADS (instance).release ();
171 instance->subgraph_entries.release ();
172 instance->cost_vec.release ();
173 free (instance);
174 }
175
176
177 /* Create an SLP node for SCALAR_STMTS. */
178
179 static slp_tree
180 vect_create_new_slp_node (slp_tree node,
181 vec<stmt_vec_info> scalar_stmts, unsigned nops)
182 {
183 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
184 SLP_TREE_CHILDREN (node).create (nops);
185 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
186 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
187 SLP_TREE_LANES (node) = scalar_stmts.length ();
188 return node;
189 }
190
191 /* Create an SLP node for SCALAR_STMTS. */
192
193 static slp_tree
194 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
195 {
196 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
197 }
198
199 /* Create an SLP node for OPS. */
200
201 static slp_tree
202 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
203 {
204 SLP_TREE_SCALAR_OPS (node) = ops;
205 SLP_TREE_DEF_TYPE (node) = vect_external_def;
206 SLP_TREE_LANES (node) = ops.length ();
207 return node;
208 }
209
210 /* Create an SLP node for OPS. */
211
212 static slp_tree
213 vect_create_new_slp_node (vec<tree> ops)
214 {
215 return vect_create_new_slp_node (new _slp_tree, ops);
216 }
217
218
219 /* This structure is used in creation of an SLP tree. Each instance
220 corresponds to the same operand in a group of scalar stmts in an SLP
221 node. */
222 typedef struct _slp_oprnd_info
223 {
224 /* Def-stmts for the operands. */
225 vec<stmt_vec_info> def_stmts;
226 /* Operands. */
227 vec<tree> ops;
228 /* Information about the first statement: its vector def-type, the type
229 of its operands, and an indication whether any of the defs comes from
230 a pattern stmt.  */
231 tree first_op_type;
232 enum vect_def_type first_dt;
233 bool any_pattern;
234 } *slp_oprnd_info;
235
236
237 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
238 operand. */
239 static vec<slp_oprnd_info>
240 vect_create_oprnd_info (int nops, int group_size)
241 {
242 int i;
243 slp_oprnd_info oprnd_info;
244 vec<slp_oprnd_info> oprnds_info;
245
246 oprnds_info.create (nops);
247 for (i = 0; i < nops; i++)
248 {
249 oprnd_info = XNEW (struct _slp_oprnd_info);
250 oprnd_info->def_stmts.create (group_size);
251 oprnd_info->ops.create (group_size);
252 oprnd_info->first_dt = vect_uninitialized_def;
253 oprnd_info->first_op_type = NULL_TREE;
254 oprnd_info->any_pattern = false;
255 oprnds_info.quick_push (oprnd_info);
256 }
257
258 return oprnds_info;
259 }
260
261
262 /* Free operands info. */
263
264 static void
265 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
266 {
267 int i;
268 slp_oprnd_info oprnd_info;
269
270 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
271 {
272 oprnd_info->def_stmts.release ();
273 oprnd_info->ops.release ();
274 XDELETE (oprnd_info);
275 }
276
277 oprnds_info.release ();
278 }
279
280
281 /* Return true if STMTS contains a pattern statement. */
282
283 static bool
284 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
285 {
286 stmt_vec_info stmt_info;
287 unsigned int i;
288 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
289 if (is_pattern_stmt_p (stmt_info))
290 return true;
291 return false;
292 }
293
294 /* Return true when all lanes in the external or constant NODE have
295 the same value. */
296
297 static bool
298 vect_slp_tree_uniform_p (slp_tree node)
299 {
300 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
301 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
302
303 /* Pre-existing vectors.  */
304 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
305 return false;
306
307 unsigned i;
308 tree op, first = NULL_TREE;
309 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
310 if (!first)
311 first = op;
312 else if (!operand_equal_p (first, op, 0))
313 return false;
314
315 return true;
316 }
317
318 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
319 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
320 of the chain. */
321
322 int
323 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
324 stmt_vec_info first_stmt_info)
325 {
326 stmt_vec_info next_stmt_info = first_stmt_info;
327 int result = 0;
328
329 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
330 return -1;
331
332 do
333 {
334 if (next_stmt_info == stmt_info)
335 return result;
336 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
337 if (next_stmt_info)
338 result += DR_GROUP_GAP (next_stmt_info);
339 }
340 while (next_stmt_info);
341
342 return -1;
343 }
344
345 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
346 using the method implemented by duplicate_and_interleave. Return true
347 if so, returning the number of intermediate vectors in *NVECTORS_OUT
348 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
349 (if nonnull). */
350
351 bool
352 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
353 tree elt_type, unsigned int *nvectors_out,
354 tree *vector_type_out,
355 tree *permutes)
356 {
357 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
358 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
359 return false;
360
361 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
362 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
363 unsigned int nvectors = 1;
364 for (;;)
365 {
366 scalar_int_mode int_mode;
367 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
368 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
369 {
370 /* Get the natural vector type for this SLP group size. */
371 tree int_type = build_nonstandard_integer_type
372 (GET_MODE_BITSIZE (int_mode), 1);
373 tree vector_type
374 = get_vectype_for_scalar_type (vinfo, int_type, count);
375 if (vector_type
376 && VECTOR_MODE_P (TYPE_MODE (vector_type))
377 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
378 GET_MODE_SIZE (base_vector_mode)))
379 {
380 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
381 together into elements of type INT_TYPE and using the result
382 to build NVECTORS vectors. */
383 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
384 vec_perm_builder sel1 (nelts, 2, 3);
385 vec_perm_builder sel2 (nelts, 2, 3);
386 poly_int64 half_nelts = exact_div (nelts, 2);
387 for (unsigned int i = 0; i < 3; ++i)
388 {
389 sel1.quick_push (i);
390 sel1.quick_push (i + nelts);
391 sel2.quick_push (half_nelts + i);
392 sel2.quick_push (half_nelts + i + nelts);
393 }
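/* For example, with NELTS == 8 the series above extends to
   sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 } (interleave the low halves of the
   two inputs) and sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 } (interleave the
   high halves).  */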
394 vec_perm_indices indices1 (sel1, 2, nelts);
395 vec_perm_indices indices2 (sel2, 2, nelts);
396 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
397 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
398 {
399 if (nvectors_out)
400 *nvectors_out = nvectors;
401 if (vector_type_out)
402 *vector_type_out = vector_type;
403 if (permutes)
404 {
405 permutes[0] = vect_gen_perm_mask_checked (vector_type,
406 indices1);
407 permutes[1] = vect_gen_perm_mask_checked (vector_type,
408 indices2);
409 }
410 return true;
411 }
412 }
413 }
414 if (!multiple_p (elt_bytes, 2, &elt_bytes))
415 return false;
416 nvectors *= 2;
417 }
418 }
419
420 /* Return true if DTA and DTB match. */
421
422 static bool
423 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
424 {
425 return (dta == dtb
426 || ((dta == vect_external_def || dta == vect_constant_def)
427 && (dtb == vect_external_def || dtb == vect_constant_def)));
428 }
429
430 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
431 they are of a valid type and that they match the defs of the first stmt of
432 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
433 by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero *SWAP
434 indicates a swap is required for cond_expr stmts.  Specifically, *SWAP
435 is 1 if STMT is a cond and the operands of its comparison need to be swapped;
436 *SWAP is 2 if STMT is a cond and the code of its comparison needs to be inverted.
437 If there is any operand swap in this function, *SWAP is set to a non-zero
438 value.
439 If there was a fatal error return -1; if the error could be corrected by
440 swapping operands of the father node of this one, return 1; if everything
441 is ok return 0.  */
442 static int
443 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
444 bool *skip_args,
445 vec<stmt_vec_info> stmts, unsigned stmt_num,
446 vec<slp_oprnd_info> *oprnds_info)
447 {
448 stmt_vec_info stmt_info = stmts[stmt_num];
449 tree oprnd;
450 unsigned int i, number_of_oprnds;
451 enum vect_def_type dt = vect_uninitialized_def;
452 slp_oprnd_info oprnd_info;
453 int first_op_idx = 1;
454 unsigned int commutative_op = -1U;
455 bool first_op_cond = false;
456 bool first = stmt_num == 0;
457
458 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
459 {
460 number_of_oprnds = gimple_call_num_args (stmt);
461 first_op_idx = 3;
462 if (gimple_call_internal_p (stmt))
463 {
464 internal_fn ifn = gimple_call_internal_fn (stmt);
465 commutative_op = first_commutative_argument (ifn);
466
467 /* Masked load, only look at mask. */
468 if (ifn == IFN_MASK_LOAD)
469 {
470 number_of_oprnds = 1;
471 /* Mask operand index. */
472 first_op_idx = 5;
473 }
474 }
475 }
476 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
477 {
478 enum tree_code code = gimple_assign_rhs_code (stmt);
479 number_of_oprnds = gimple_num_ops (stmt) - 1;
480 /* Swapping can only be done for cond_expr if asked to; otherwise we
481 could end up with a different comparison code from the first stmt.  */
482 if (code == COND_EXPR
483 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
484 {
485 first_op_cond = true;
486 number_of_oprnds++;
487 }
488 else
489 commutative_op = commutative_tree_code (code) ? 0U : -1U;
490 }
491 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
492 number_of_oprnds = gimple_phi_num_args (stmt);
493 else
494 return -1;
495
496 bool swapped = (swap != 0);
497 bool backedge = false;
498 gcc_assert (!swapped || first_op_cond);
499 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
500 for (i = 0; i < number_of_oprnds; i++)
501 {
502 if (first_op_cond)
503 {
504 /* Map indicating how operands of cond_expr should be swapped. */
505 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
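/* maps[1] exchanges the two operands of the comparison (swap == 1),
   while maps[2] exchanges the THEN and ELSE arms of the COND_EXPR,
   which is how an inverted comparison code (swap == 2) is realized.  */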
506 int *map = maps[swap];
507
508 if (i < 2)
509 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
510 first_op_idx), map[i]);
511 else
512 oprnd = gimple_op (stmt_info->stmt, map[i]);
513 }
514 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
515 {
516 oprnd = gimple_phi_arg_def (stmt, i);
517 backedge = dominated_by_p (CDI_DOMINATORS,
518 gimple_phi_arg_edge (stmt, i)->src,
519 gimple_bb (stmt_info->stmt));
520 }
521 else
522 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
523 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
524 oprnd = TREE_OPERAND (oprnd, 0);
525
526 oprnd_info = (*oprnds_info)[i];
527
528 stmt_vec_info def_stmt_info;
529 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
530 {
531 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
533 "Build SLP failed: can't analyze def for %T\n",
534 oprnd);
535
536 return -1;
537 }
538
539 if (skip_args[i])
540 {
541 oprnd_info->def_stmts.quick_push (NULL);
542 oprnd_info->ops.quick_push (NULL_TREE);
543 oprnd_info->first_dt = vect_uninitialized_def;
544 continue;
545 }
546
547 oprnd_info->def_stmts.quick_push (def_stmt_info);
548 oprnd_info->ops.quick_push (oprnd);
549
550 if (def_stmt_info
551 && is_pattern_stmt_p (def_stmt_info))
552 {
553 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
554 != def_stmt_info)
555 oprnd_info->any_pattern = true;
556 else
557 /* If we promote this to external use the original stmt def. */
558 oprnd_info->ops.last ()
559 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
560 }
561
562 /* If there's an extern def on a backedge make sure we can
563 code-generate at the region start.
564 ??? This is another case that could be fixed by adjusting
565 how we split the function but at the moment we'd have conflicting
566 goals there. */
567 if (backedge
568 && dts[i] == vect_external_def
569 && is_a <bb_vec_info> (vinfo)
570 && TREE_CODE (oprnd) == SSA_NAME
571 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
572 && !dominated_by_p (CDI_DOMINATORS,
573 as_a <bb_vec_info> (vinfo)->bbs[0],
574 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
575 {
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
578 "Build SLP failed: extern def %T only defined "
579 "on backedge\n", oprnd);
580 return -1;
581 }
582
583 if (first)
584 {
585 tree type = TREE_TYPE (oprnd);
586 dt = dts[i];
587 if ((dt == vect_constant_def
588 || dt == vect_external_def)
589 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
590 && (TREE_CODE (type) == BOOLEAN_TYPE
591 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
592 type)))
593 {
594 if (dump_enabled_p ())
595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
596 "Build SLP failed: invalid type of def "
597 "for variable-length SLP %T\n", oprnd);
598 return -1;
599 }
600
601 /* For the swapping logic below force vect_reduction_def
602 for the reduction op in a SLP reduction group. */
603 if (!STMT_VINFO_DATA_REF (stmt_info)
604 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
605 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
606 && def_stmt_info)
607 dts[i] = dt = vect_reduction_def;
608
609 /* Check the types of the definition. */
610 switch (dt)
611 {
612 case vect_external_def:
613 case vect_constant_def:
614 case vect_internal_def:
615 case vect_reduction_def:
616 case vect_induction_def:
617 case vect_nested_cycle:
618 break;
619
620 default:
621 /* FORNOW: Not supported. */
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
624 "Build SLP failed: illegal type of def %T\n",
625 oprnd);
626 return -1;
627 }
628
629 oprnd_info->first_dt = dt;
630 oprnd_info->first_op_type = type;
631 }
632 }
633 if (first)
634 return 0;
635
636 /* Now match the operand definition types to that of the first stmt. */
637 for (i = 0; i < number_of_oprnds;)
638 {
639 if (skip_args[i])
640 {
641 ++i;
642 continue;
643 }
644
645 oprnd_info = (*oprnds_info)[i];
646 dt = dts[i];
647 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
648 oprnd = oprnd_info->ops[stmt_num];
649 tree type = TREE_TYPE (oprnd);
650
651 if (!types_compatible_p (oprnd_info->first_op_type, type))
652 {
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
655 "Build SLP failed: different operand types\n");
656 return 1;
657 }
658
659 /* Not first stmt of the group, check that the def-stmt/s match
660 the def-stmt/s of the first stmt. Allow different definition
661 types for reduction chains: the first stmt must be a
662 vect_reduction_def (a phi node), and the rest
663 end up in the reduction chain.  */
664 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
665 && !(oprnd_info->first_dt == vect_reduction_def
666 && !STMT_VINFO_DATA_REF (stmt_info)
667 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
668 && def_stmt_info
669 && !STMT_VINFO_DATA_REF (def_stmt_info)
670 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
671 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
672 || (!STMT_VINFO_DATA_REF (stmt_info)
673 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
674 && ((!def_stmt_info
675 || STMT_VINFO_DATA_REF (def_stmt_info)
676 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
677 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
678 != (oprnd_info->first_dt != vect_reduction_def))))
679 {
680 /* Try swapping operands if we got a mismatch. For BB
681 vectorization only in case it will clearly improve things. */
682 if (i == commutative_op && !swapped
683 && (!is_a <bb_vec_info> (vinfo)
684 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
685 dts[i+1])
686 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
687 || vect_def_types_match
688 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
689 {
690 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE, vect_location,
692 "trying swapped operands\n");
693 std::swap (dts[i], dts[i+1]);
694 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
695 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
696 std::swap ((*oprnds_info)[i]->ops[stmt_num],
697 (*oprnds_info)[i+1]->ops[stmt_num]);
698 swapped = true;
699 continue;
700 }
701
702 if (is_a <bb_vec_info> (vinfo)
703 && !oprnd_info->any_pattern)
704 {
705 /* Now for commutative ops we should see whether we can
706 make the other operand matching. */
707 if (dump_enabled_p ())
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
709 "treating operand as external\n");
710 oprnd_info->first_dt = dt = vect_external_def;
711 }
712 else
713 {
714 if (dump_enabled_p ())
715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
716 "Build SLP failed: different types\n");
717 return 1;
718 }
719 }
720
721 /* Make sure to demote the overall operand to external. */
722 if (dt == vect_external_def)
723 oprnd_info->first_dt = vect_external_def;
724 /* For a SLP reduction chain we want to duplicate the reduction to
725 each of the chain members. That gets us a sane SLP graph (still
726 the stmts are not 100% correct wrt the initial values). */
727 else if ((dt == vect_internal_def
728 || dt == vect_reduction_def)
729 && oprnd_info->first_dt == vect_reduction_def
730 && !STMT_VINFO_DATA_REF (stmt_info)
731 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
732 && !STMT_VINFO_DATA_REF (def_stmt_info)
733 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
734 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
735 {
736 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
737 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
738 }
739
740 ++i;
741 }
742
743 /* Swap operands. */
744 if (swapped)
745 {
746 if (dump_enabled_p ())
747 dump_printf_loc (MSG_NOTE, vect_location,
748 "swapped operands to match def types in %G",
749 stmt_info->stmt);
750 }
751
752 return 0;
753 }
754
755 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
756 Return true if we can, meaning that this choice doesn't conflict with
757 existing SLP nodes that use STMT_INFO. */
758
759 bool
760 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
761 {
762 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
763 if (old_vectype)
764 return useless_type_conversion_p (vectype, old_vectype);
765
766 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
767 {
768 /* We maintain the invariant that if any statement in the group is
769 used, all other members of the group have the same vector type. */
770 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
771 stmt_vec_info member_info = first_info;
772 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
773 if (is_pattern_stmt_p (member_info)
774 && !useless_type_conversion_p (vectype,
775 STMT_VINFO_VECTYPE (member_info)))
776 break;
777
778 if (!member_info)
779 {
780 for (member_info = first_info; member_info;
781 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
782 STMT_VINFO_VECTYPE (member_info) = vectype;
783 return true;
784 }
785 }
786 else if (!is_pattern_stmt_p (stmt_info))
787 {
788 STMT_VINFO_VECTYPE (stmt_info) = vectype;
789 return true;
790 }
791
792 if (dump_enabled_p ())
793 {
794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
795 "Build SLP failed: incompatible vector"
796 " types for: %G", stmt_info->stmt);
797 dump_printf_loc (MSG_NOTE, vect_location,
798 " old vector type: %T\n", old_vectype);
799 dump_printf_loc (MSG_NOTE, vect_location,
800 " new vector type: %T\n", vectype);
801 }
802 return false;
803 }
804
805 /* Return true if call statements CALL1 and CALL2 are similar enough
806 to be combined into the same SLP group. */
807
808 static bool
809 compatible_calls_p (gcall *call1, gcall *call2)
810 {
811 unsigned int nargs = gimple_call_num_args (call1);
812 if (nargs != gimple_call_num_args (call2))
813 return false;
814
815 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
816 return false;
817
818 if (gimple_call_internal_p (call1))
819 {
820 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
821 TREE_TYPE (gimple_call_lhs (call2))))
822 return false;
823 for (unsigned int i = 0; i < nargs; ++i)
824 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
825 TREE_TYPE (gimple_call_arg (call2, i))))
826 return false;
827 }
828 else
829 {
830 if (!operand_equal_p (gimple_call_fn (call1),
831 gimple_call_fn (call2), 0))
832 return false;
833
834 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
835 return false;
836 }
837 return true;
838 }
839
840 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
841 caller's attempt to find the vector type in STMT_INFO with the narrowest
842 element type. Return true if VECTYPE is nonnull and if it is valid
843 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
844 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
845 vect_build_slp_tree. */
846
847 static bool
848 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
849 unsigned int group_size,
850 tree vectype, poly_uint64 *max_nunits)
851 {
852 if (!vectype)
853 {
854 if (dump_enabled_p ())
855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
856 "Build SLP failed: unsupported data-type in %G\n",
857 stmt_info->stmt);
858 /* Fatal mismatch. */
859 return false;
860 }
861
862 /* If populating the vector type requires unrolling then fail
863 before adjusting *max_nunits for basic-block vectorization. */
864 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
865 unsigned HOST_WIDE_INT const_nunits;
866 if (is_a <bb_vec_info> (vinfo)
867 && (!nunits.is_constant (&const_nunits)
868 || const_nunits > group_size))
869 {
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
872 "Build SLP failed: unrolling required "
873 "in basic block SLP\n");
874 /* Fatal mismatch. */
875 return false;
876 }
877
878 /* In case of multiple types we need to detect the smallest type. */
879 vect_update_max_nunits (max_nunits, vectype);
880 return true;
881 }
882
883 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
884 permutation and are of supported types of operation.  Return
885 true if they are, otherwise return false and indicate in *MATCHES
886 which stmts are not isomorphic to the first one.  If MATCHES[0]
887 is false then this indicates the comparison could not be
888 carried out or the stmts will never be vectorized by SLP.
889
890 Note a COND_EXPR is possibly isomorphic to another one after swapping its
891 operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
892 the first stmt by swapping the two operands of its comparison; set SWAP[i]
893 to 2 if stmt I is isomorphic to the first stmt by inverting the code
894 of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
895 to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1) ? Y1 : X1.  */
896
897 static bool
898 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
899 vec<stmt_vec_info> stmts, unsigned int group_size,
900 poly_uint64 *max_nunits, bool *matches,
901 bool *two_operators, tree *node_vectype)
902 {
903 unsigned int i;
904 stmt_vec_info first_stmt_info = stmts[0];
905 enum tree_code first_stmt_code = ERROR_MARK;
906 enum tree_code alt_stmt_code = ERROR_MARK;
907 enum tree_code rhs_code = ERROR_MARK;
908 enum tree_code first_cond_code = ERROR_MARK;
909 tree lhs;
910 bool need_same_oprnds = false;
911 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
912 optab optab;
913 int icode;
914 machine_mode optab_op2_mode;
915 machine_mode vec_mode;
916 stmt_vec_info first_load = NULL, prev_first_load = NULL;
917 bool first_stmt_load_p = false, load_p = false;
918 bool first_stmt_phi_p = false, phi_p = false;
919
920 /* For every stmt in NODE find its def stmt/s. */
921 stmt_vec_info stmt_info;
922 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
923 {
924 gimple *stmt = stmt_info->stmt;
925 swap[i] = 0;
926 matches[i] = false;
927
928 if (dump_enabled_p ())
929 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
930
931 /* Fail to vectorize statements marked as unvectorizable, statements
932 that can throw, or statements with volatile ops.  */
933 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
934 || stmt_can_throw_internal (cfun, stmt)
935 || gimple_has_volatile_ops (stmt))
936 {
937 if (dump_enabled_p ())
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "Build SLP failed: unvectorizable statement %G",
940 stmt);
941 /* ??? For BB vectorization we want to commute operands in a way
942 that shuffles all unvectorizable defs into one operand and keeps
943 the other still vectorizable.  The following doesn't reliably
944 work for this though, but it's the easiest we can do here.  */
945 if (is_a <bb_vec_info> (vinfo) && i != 0)
946 continue;
947 /* Fatal mismatch. */
948 matches[0] = false;
949 return false;
950 }
951
952 lhs = gimple_get_lhs (stmt);
953 if (lhs == NULL_TREE)
954 {
955 if (dump_enabled_p ())
956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
957 "Build SLP failed: not GIMPLE_ASSIGN nor "
958 "GIMPLE_CALL %G", stmt);
959 if (is_a <bb_vec_info> (vinfo) && i != 0)
960 continue;
961 /* Fatal mismatch. */
962 matches[0] = false;
963 return false;
964 }
965
966 tree nunits_vectype;
967 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
968 &nunits_vectype, group_size)
969 || (nunits_vectype
970 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
971 nunits_vectype, max_nunits)))
972 {
973 if (is_a <bb_vec_info> (vinfo) && i != 0)
974 continue;
975 /* Fatal mismatch. */
976 matches[0] = false;
977 return false;
978 }
979
980 gcc_assert (vectype);
981
982 gcall *call_stmt = dyn_cast <gcall *> (stmt);
983 if (call_stmt)
984 {
985 rhs_code = CALL_EXPR;
986
987 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
988 load_p = true;
989 else if ((gimple_call_internal_p (call_stmt)
990 && (!vectorizable_internal_fn_p
991 (gimple_call_internal_fn (call_stmt))))
992 || gimple_call_tail_p (call_stmt)
993 || gimple_call_noreturn_p (call_stmt)
994 || !gimple_call_nothrow_p (call_stmt)
995 || gimple_call_chain (call_stmt))
996 {
997 if (dump_enabled_p ())
998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
999 "Build SLP failed: unsupported call type %G",
1000 call_stmt);
1001 if (is_a <bb_vec_info> (vinfo) && i != 0)
1002 continue;
1003 /* Fatal mismatch. */
1004 matches[0] = false;
1005 return false;
1006 }
1007 }
1008 else if (gimple_code (stmt) == GIMPLE_PHI)
1009 {
1010 rhs_code = ERROR_MARK;
1011 phi_p = true;
1012 }
1013 else
1014 {
1015 rhs_code = gimple_assign_rhs_code (stmt);
1016 load_p = gimple_vuse (stmt);
1017 }
1018
1019 /* Check the operation. */
1020 if (i == 0)
1021 {
1022 *node_vectype = vectype;
1023 first_stmt_code = rhs_code;
1024 first_stmt_load_p = load_p;
1025 first_stmt_phi_p = phi_p;
1026
1027 /* Shift arguments should be equal in all the packed stmts for a
1028 vector shift with scalar shift operand. */
1029 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1030 || rhs_code == LROTATE_EXPR
1031 || rhs_code == RROTATE_EXPR)
1032 {
1033 vec_mode = TYPE_MODE (vectype);
1034
1035 /* First see if we have a vector/vector shift. */
1036 optab = optab_for_tree_code (rhs_code, vectype,
1037 optab_vector);
1038
1039 if (!optab
1040 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1041 {
1042 /* No vector/vector shift, try for a vector/scalar shift. */
1043 optab = optab_for_tree_code (rhs_code, vectype,
1044 optab_scalar);
1045
1046 if (!optab)
1047 {
1048 if (dump_enabled_p ())
1049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1050 "Build SLP failed: no optab.\n");
1051 if (is_a <bb_vec_info> (vinfo) && i != 0)
1052 continue;
1053 /* Fatal mismatch. */
1054 matches[0] = false;
1055 return false;
1056 }
1057 icode = (int) optab_handler (optab, vec_mode);
1058 if (icode == CODE_FOR_nothing)
1059 {
1060 if (dump_enabled_p ())
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1062 "Build SLP failed: "
1063 "op not supported by target.\n");
1064 if (is_a <bb_vec_info> (vinfo) && i != 0)
1065 continue;
1066 /* Fatal mismatch. */
1067 matches[0] = false;
1068 return false;
1069 }
1070 optab_op2_mode = insn_data[icode].operand[2].mode;
1071 if (!VECTOR_MODE_P (optab_op2_mode))
1072 {
1073 need_same_oprnds = true;
1074 first_op1 = gimple_assign_rhs2 (stmt);
1075 }
1076 }
1077 }
1078 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1079 {
1080 need_same_oprnds = true;
1081 first_op1 = gimple_assign_rhs2 (stmt);
1082 }
1083 else if (!load_p
1084 && rhs_code == BIT_FIELD_REF)
1085 {
1086 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1087 if (TREE_CODE (vec) != SSA_NAME
1088 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1089 {
1090 if (is_a <bb_vec_info> (vinfo) && i != 0)
1091 continue;
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094 "Build SLP failed: "
1095 "BIT_FIELD_REF not supported\n");
1096 /* Fatal mismatch. */
1097 matches[0] = false;
1098 return false;
1099 }
1100 }
1101 else if (call_stmt
1102 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1103 {
1104 need_same_oprnds = true;
1105 first_op1 = gimple_call_arg (call_stmt, 1);
1106 }
1107 }
1108 else
1109 {
1110 if (first_stmt_code != rhs_code
1111 && alt_stmt_code == ERROR_MARK)
1112 alt_stmt_code = rhs_code;
1113 if ((first_stmt_code != rhs_code
1114 && (first_stmt_code != IMAGPART_EXPR
1115 || rhs_code != REALPART_EXPR)
1116 && (first_stmt_code != REALPART_EXPR
1117 || rhs_code != IMAGPART_EXPR)
1118 /* Handle mismatches in plus/minus by computing both
1119 and merging the results. */
1120 && !((first_stmt_code == PLUS_EXPR
1121 || first_stmt_code == MINUS_EXPR)
1122 && (alt_stmt_code == PLUS_EXPR
1123 || alt_stmt_code == MINUS_EXPR)
1124 && rhs_code == alt_stmt_code)
1125 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1126 && (first_stmt_code == ARRAY_REF
1127 || first_stmt_code == BIT_FIELD_REF
1128 || first_stmt_code == INDIRECT_REF
1129 || first_stmt_code == COMPONENT_REF
1130 || first_stmt_code == MEM_REF)))
1131 || first_stmt_load_p != load_p
1132 || first_stmt_phi_p != phi_p)
1133 {
1134 if (dump_enabled_p ())
1135 {
1136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137 "Build SLP failed: different operation "
1138 "in stmt %G", stmt);
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "original stmt %G", first_stmt_info->stmt);
1141 }
1142 /* Mismatch. */
1143 continue;
1144 }
1145
1146 if (need_same_oprnds)
1147 {
1148 tree other_op1 = (call_stmt
1149 ? gimple_call_arg (call_stmt, 1)
1150 : gimple_assign_rhs2 (stmt));
1151 if (!operand_equal_p (first_op1, other_op1, 0))
1152 {
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155 "Build SLP failed: different shift "
1156 "arguments in %G", stmt);
1157 /* Mismatch. */
1158 continue;
1159 }
1160 }
1161 if (!load_p
1162 && first_stmt_code == BIT_FIELD_REF
1163 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1164 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1165 {
1166 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1168 "Build SLP failed: different BIT_FIELD_REF "
1169 "arguments in %G", stmt);
1170 /* Mismatch. */
1171 continue;
1172 }
1173
1174 if (!load_p && rhs_code == CALL_EXPR)
1175 {
1176 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1177 as_a <gcall *> (stmt)))
1178 {
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 "Build SLP failed: different calls in %G",
1182 stmt);
1183 /* Mismatch. */
1184 continue;
1185 }
1186 }
1187
1188 if (phi_p
1189 && (gimple_bb (first_stmt_info->stmt)
1190 != gimple_bb (stmt_info->stmt)))
1191 {
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1194 "Build SLP failed: different BB for PHI "
1195 "in %G", stmt);
1196 /* Mismatch. */
1197 continue;
1198 }
1199
1200 if (!types_compatible_p (vectype, *node_vectype))
1201 {
1202 if (dump_enabled_p ())
1203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1204 "Build SLP failed: different vector type "
1205 "in %G", stmt);
1206 /* Mismatch. */
1207 continue;
1208 }
1209 }
1210
1211 /* Grouped store or load. */
1212 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1213 {
1214 if (REFERENCE_CLASS_P (lhs))
1215 {
1216 /* Store. */
1217 ;
1218 }
1219 else
1220 {
1221 /* Load. */
1222 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1223 if (prev_first_load)
1224 {
1225 /* Check that there are no loads from different interleaving
1226 chains in the same node. */
1227 if (prev_first_load != first_load)
1228 {
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1231 vect_location,
1232 "Build SLP failed: different "
1233 "interleaving chains in one node %G",
1234 stmt);
1235 /* Mismatch. */
1236 continue;
1237 }
1238 }
1239 else
1240 prev_first_load = first_load;
1241 }
1242 } /* Grouped access. */
1243 else
1244 {
1245 if (load_p)
1246 {
1247 /* Not grouped load. */
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "Build SLP failed: not grouped load %G", stmt);
1251
1252 /* FORNOW: Not grouped loads are not supported. */
1253 if (is_a <bb_vec_info> (vinfo) && i != 0)
1254 continue;
1255 /* Fatal mismatch. */
1256 matches[0] = false;
1257 return false;
1258 }
1259
1260 /* Not memory operation. */
1261 if (!phi_p
1262 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1263 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1264 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1265 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1266 && rhs_code != VIEW_CONVERT_EXPR
1267 && rhs_code != CALL_EXPR
1268 && rhs_code != BIT_FIELD_REF)
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "Build SLP failed: operation unsupported %G",
1273 stmt);
1274 if (is_a <bb_vec_info> (vinfo) && i != 0)
1275 continue;
1276 /* Fatal mismatch. */
1277 matches[0] = false;
1278 return false;
1279 }
1280
1281 if (rhs_code == COND_EXPR)
1282 {
1283 tree cond_expr = gimple_assign_rhs1 (stmt);
1284 enum tree_code cond_code = TREE_CODE (cond_expr);
1285 enum tree_code swap_code = ERROR_MARK;
1286 enum tree_code invert_code = ERROR_MARK;
1287
1288 if (i == 0)
1289 first_cond_code = TREE_CODE (cond_expr);
1290 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1291 {
1292 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1293 swap_code = swap_tree_comparison (cond_code);
1294 invert_code = invert_tree_comparison (cond_code, honor_nans);
1295 }
1296
1297 if (first_cond_code == cond_code)
1298 ;
1299 /* Isomorphism can be achieved by swapping.  */
1300 else if (first_cond_code == swap_code)
1301 swap[i] = 1;
1302 /* Isomorphism can be achieved by inverting.  */
1303 else if (first_cond_code == invert_code)
1304 swap[i] = 2;
1305 else
1306 {
1307 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1309 "Build SLP failed: different"
1310 " operation %G", stmt);
1311 /* Mismatch. */
1312 continue;
1313 }
1314 }
1315 }
1316
1317 matches[i] = true;
1318 }
1319
1320 for (i = 0; i < group_size; ++i)
1321 if (!matches[i])
1322 return false;
1323
1324 /* If we allowed a two-operation SLP node verify the target can cope
1325 with the permute we are going to use. */
1326 if (alt_stmt_code != ERROR_MARK
1327 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1328 {
1329 *two_operators = true;
1330 }
1331
1332 return true;
1333 }
1334
1335 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1336 Note we never remove apart from at destruction time so we do not
1337 need a special value for deleted that differs from empty. */
1338 struct bst_traits
1339 {
1340 typedef vec <stmt_vec_info> value_type;
1341 typedef vec <stmt_vec_info> compare_type;
1342 static inline hashval_t hash (value_type);
1343 static inline bool equal (value_type existing, value_type candidate);
1344 static inline bool is_empty (value_type x) { return !x.exists (); }
1345 static inline bool is_deleted (value_type x) { return !x.exists (); }
1346 static const bool empty_zero_p = true;
1347 static inline void mark_empty (value_type &x) { x.release (); }
1348 static inline void mark_deleted (value_type &x) { x.release (); }
1349 static inline void remove (value_type &x) { x.release (); }
1350 };
1351 inline hashval_t
1352 bst_traits::hash (value_type x)
1353 {
1354 inchash::hash h;
1355 for (unsigned i = 0; i < x.length (); ++i)
1356 h.add_int (gimple_uid (x[i]->stmt));
1357 return h.end ();
1358 }
1359 inline bool
1360 bst_traits::equal (value_type existing, value_type candidate)
1361 {
1362 if (existing.length () != candidate.length ())
1363 return false;
1364 for (unsigned i = 0; i < existing.length (); ++i)
1365 if (existing[i] != candidate[i])
1366 return false;
1367 return true;
1368 }
1369
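/* The "bst map" caches SLP discovery results keyed on the set of scalar
   stmts: a successful build maps to the shared SLP node (re-used with its
   reference count bumped), a failed build maps to NULL so that it is not
   retried.  */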
1370 typedef hash_map <vec <gimple *>, slp_tree,
1371 simple_hashmap_traits <bst_traits, slp_tree> >
1372 scalar_stmts_to_slp_tree_map_t;
1373
1374 static slp_tree
1375 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1376 vec<stmt_vec_info> stmts, unsigned int group_size,
1377 poly_uint64 *max_nunits,
1378 bool *matches, unsigned *limit, unsigned *tree_size,
1379 scalar_stmts_to_slp_tree_map_t *bst_map);
1380
1381 static slp_tree
1382 vect_build_slp_tree (vec_info *vinfo,
1383 vec<stmt_vec_info> stmts, unsigned int group_size,
1384 poly_uint64 *max_nunits,
1385 bool *matches, unsigned *limit, unsigned *tree_size,
1386 scalar_stmts_to_slp_tree_map_t *bst_map)
1387 {
1388 if (slp_tree *leader = bst_map->get (stmts))
1389 {
1390 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1392 *leader ? "" : "failed ", *leader);
1393 if (*leader)
1394 {
1395 SLP_TREE_REF_COUNT (*leader)++;
1396 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1397 }
1398 return *leader;
1399 }
1400
1401 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1402 so we can pick up backedge destinations during discovery. */
1403 slp_tree res = new _slp_tree;
1404 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1405 SLP_TREE_SCALAR_STMTS (res) = stmts;
1406 bst_map->put (stmts.copy (), res);
1407
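/* Bound the amount of discovery work: each attempted build of a node
   consumes one unit of *LIMIT, and once it is exhausted we record a
   failed entry instead of recursing any further.  */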
1408 if (*limit == 0)
1409 {
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_NOTE, vect_location,
1412 "SLP discovery limit exceeded\n");
1413 bool existed_p = bst_map->put (stmts, NULL);
1414 gcc_assert (existed_p);
1415 /* Mark the node invalid so we can detect those when still in use
1416 as backedge destinations. */
1417 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1418 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1419 vect_free_slp_tree (res);
1420 return NULL;
1421 }
1422 --*limit;
1423
1424 poly_uint64 this_max_nunits = 1;
1425 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1426 &this_max_nunits,
1427 matches, limit, tree_size, bst_map);
1428 if (!res_)
1429 {
1430 bool existed_p = bst_map->put (stmts, NULL);
1431 gcc_assert (existed_p);
1432 /* Mark the node invalid so we can detect those when still in use
1433 as backedge destinations. */
1434 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1435 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1436 vect_free_slp_tree (res);
1437 }
1438 else
1439 {
1440 gcc_assert (res_ == res);
1441 res->max_nunits = this_max_nunits;
1442 vect_update_max_nunits (max_nunits, this_max_nunits);
1443 /* Keep a reference for the bst_map use. */
1444 SLP_TREE_REF_COUNT (res)++;
1445 }
1446 return res_;
1447 }
1448
1449 /* Recursively build an SLP tree starting from NODE.
1450 Return NULL if the def-stmts are not isomorphic, require data
1451 permutation or are of unsupported types of operation; in that
1452 case *MATCHES indicates which stmts did not match the first one.
1453 On success return NODE with its operand SLP children filled in
1454 recursively.  */
1455
1456 static slp_tree
1457 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1458 vec<stmt_vec_info> stmts, unsigned int group_size,
1459 poly_uint64 *max_nunits,
1460 bool *matches, unsigned *limit, unsigned *tree_size,
1461 scalar_stmts_to_slp_tree_map_t *bst_map)
1462 {
1463 unsigned nops, i, this_tree_size = 0;
1464 poly_uint64 this_max_nunits = *max_nunits;
1465
1466 matches[0] = false;
1467
1468 stmt_vec_info stmt_info = stmts[0];
1469 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1470 nops = gimple_call_num_args (stmt);
1471 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1472 {
1473 nops = gimple_num_ops (stmt) - 1;
1474 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1475 nops++;
1476 }
1477 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1478 nops = gimple_phi_num_args (phi);
1479 else
1480 return NULL;
1481
1482 /* If the SLP node is a PHI (induction or reduction), terminate
1483 the recursion. */
1484 bool *skip_args = XALLOCAVEC (bool, nops);
1485 memset (skip_args, 0, sizeof (bool) * nops);
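/* skip_args[I] set below means operand I is not built as an SLP child;
   e.g. induction initial values or reduction latch defs that are
   handled elsewhere.  */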
1486 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1487 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1488 {
1489 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1490 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1491 group_size);
1492 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1493 max_nunits))
1494 return NULL;
1495
1496 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1497 if (def_type == vect_induction_def)
1498 {
1499 /* Induction PHIs are not cycles but walk the initial
1500 value.  Only for inner loops though; for outer loops
1501 we need to pick up the value from the actual PHIs
1502 to more easily support peeling and epilogue vectorization. */
1503 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1504 if (!nested_in_vect_loop_p (loop, stmt_info))
1505 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1506 else
1507 loop = loop->inner;
1508 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1509 }
1510 else if (def_type == vect_reduction_def
1511 || def_type == vect_double_reduction_def
1512 || def_type == vect_nested_cycle)
1513 {
1514 /* Else def types have to match. */
1515 stmt_vec_info other_info;
1516 bool all_same = true;
1517 FOR_EACH_VEC_ELT (stmts, i, other_info)
1518 {
1519 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1520 return NULL;
1521 if (other_info != stmt_info)
1522 all_same = false;
1523 }
1524 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1525 /* Reduction initial values are not explicitly represented.  */
1526 if (!nested_in_vect_loop_p (loop, stmt_info))
1527 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1528 /* Reduction chain backedge defs are filled manually.
1529 ??? Need a better way to identify a SLP reduction chain PHI.
1530 Or a better overall way to SLP match those. */
1531 if (all_same && def_type == vect_reduction_def)
1532 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1533 }
1534 else if (def_type != vect_internal_def)
1535 return NULL;
1536 }
1537
1538
1539 bool two_operators = false;
1540 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1541 tree vectype = NULL_TREE;
1542 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1543 &this_max_nunits, matches, &two_operators,
1544 &vectype))
1545 return NULL;
1546
1547 /* If the SLP node is a load, terminate the recursion unless masked. */
1548 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1549 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1550 {
1551 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1552 {
1553 /* Masked load. */
1554 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1555 nops = 1;
1556 }
1557 else
1558 {
1559 *max_nunits = this_max_nunits;
1560 (*tree_size)++;
1561 node = vect_create_new_slp_node (node, stmts, 0);
1562 SLP_TREE_VECTYPE (node) = vectype;
1563 /* And compute the load permutation. Whether it is actually
1564 a permutation depends on the unrolling factor which is
1565 decided later. */
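/* For example, if the interleaving chain starts at a[0] and this node's
   lanes load a[2] and a[0], the load permutation is { 2, 0 }.  */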
1566 vec<unsigned> load_permutation;
1567 int j;
1568 stmt_vec_info load_info;
1569 load_permutation.create (group_size);
1570 stmt_vec_info first_stmt_info
1571 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1572 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1573 {
1574 int load_place = vect_get_place_in_interleaving_chain
1575 (load_info, first_stmt_info);
1576 gcc_assert (load_place != -1);
1577 load_permutation.safe_push (load_place);
1578 }
1579 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1580 return node;
1581 }
1582 }
1583 else if (gimple_assign_single_p (stmt_info->stmt)
1584 && !gimple_vuse (stmt_info->stmt)
1585 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1586 {
1587 /* vect_build_slp_tree_1 determined that all BIT_FIELD_REFs reference
1588 the same SSA name vector of a type compatible with vectype.  */
1589 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1590 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1591 stmt_vec_info estmt_info;
1592 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1593 {
1594 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1595 tree bfref = gimple_assign_rhs1 (estmt);
1596 HOST_WIDE_INT lane;
1597 if (!known_eq (bit_field_size (bfref),
1598 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1599 || !constant_multiple_p (bit_field_offset (bfref),
1600 bit_field_size (bfref), &lane))
1601 {
1602 lperm.release ();
1603 return NULL;
1604 }
1605 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1606 }
1607 slp_tree vnode = vect_create_new_slp_node (vNULL);
1608 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1609 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1610 /* We are always building a permutation node even if it is an identity
1611 permute to shield the rest of the vectorizer from the odd node
1612 representing an actual vector without any scalar ops.
1613 ??? We could hide it completely by making the permute node
1614 external? */
1615 node = vect_create_new_slp_node (node, stmts, 1);
1616 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1617 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1618 SLP_TREE_VECTYPE (node) = vectype;
1619 SLP_TREE_CHILDREN (node).quick_push (vnode);
1620 return node;
1621 }
1622
1623 /* Get at the operands, verifying they are compatible. */
1624 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1625 slp_oprnd_info oprnd_info;
1626 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1627 {
1628 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1629 stmts, i, &oprnds_info);
1630 if (res != 0)
1631 matches[(res == -1) ? 0 : i] = false;
1632 if (!matches[0])
1633 break;
1634 }
1635 for (i = 0; i < group_size; ++i)
1636 if (!matches[i])
1637 {
1638 vect_free_oprnd_info (oprnds_info);
1639 return NULL;
1640 }
1641 swap = NULL;
1642
1643 auto_vec<slp_tree, 4> children;
1644
1645 stmt_info = stmts[0];
1646
1647 /* Create SLP_TREE nodes for the definition node/s. */
1648 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1649 {
1650 slp_tree child;
1651 unsigned int j;
1652
1653 /* We're skipping certain operands from processing, for example
1654 outer loop reduction initial defs. */
1655 if (skip_args[i])
1656 {
1657 children.safe_push (NULL);
1658 continue;
1659 }
1660
1661 if (oprnd_info->first_dt == vect_uninitialized_def)
1662 {
1663 /* COND_EXPRs end up with one unused operand slot here if the condition
1664 is an SSA name.  */
1665 gcc_assert (i == 3 && nops == 4);
1666 continue;
1667 }
1668
1669 if (is_a <bb_vec_info> (vinfo)
1670 && oprnd_info->first_dt == vect_internal_def
1671 && !oprnd_info->any_pattern)
1672 {
1673 /* For BB vectorization, if all defs are the same do not
1674 bother to continue the build along the single-lane
1675 graph but use a splat of the scalar value. */
1676 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1677 for (j = 1; j < group_size; ++j)
1678 if (oprnd_info->def_stmts[j] != first_def)
1679 break;
1680 if (j == group_size
1681 /* But avoid doing this for loads where we may be
1682 able to CSE things, unless the stmt is not
1683 vectorizable. */
1684 && (!STMT_VINFO_VECTORIZABLE (first_def)
1685 || !gimple_vuse (first_def->stmt)))
1686 {
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "Using a splat of the uniform operand\n");
1690 oprnd_info->first_dt = vect_external_def;
1691 }
1692 }
1693
1694 if (oprnd_info->first_dt == vect_external_def
1695 || oprnd_info->first_dt == vect_constant_def)
1696 {
1697 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1698 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1699 oprnd_info->ops = vNULL;
1700 children.safe_push (invnode);
1701 continue;
1702 }
1703
1704 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1705 group_size, &this_max_nunits,
1706 matches, limit,
1707 &this_tree_size, bst_map)) != NULL)
1708 {
1709 oprnd_info->def_stmts = vNULL;
1710 children.safe_push (child);
1711 continue;
1712 }
1713
1714 /* If the SLP build for operand zero failed and operand zero
1715 and one can be commuted, try that for the scalar stmts
1716 that failed the match. */
1717 if (i == 0
1718 /* A first scalar stmt mismatch signals a fatal mismatch. */
1719 && matches[0]
1720 /* ??? For COND_EXPRs we can swap the comparison operands
1721 as well as the arms under some constraints. */
1722 && nops == 2
1723 && oprnds_info[1]->first_dt == vect_internal_def
1724 && is_gimple_assign (stmt_info->stmt)
1725 /* Swapping operands for reductions breaks assumptions later on. */
1726 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1727 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1728 {
1729 /* See whether we can swap the matching or the non-matching
1730 stmt operands. */
1731 bool swap_not_matching = true;
1732 do
1733 {
1734 for (j = 0; j < group_size; ++j)
1735 {
1736 if (matches[j] != !swap_not_matching)
1737 continue;
1738 stmt_vec_info stmt_info = stmts[j];
1739 /* Verify if we can swap operands of this stmt. */
1740 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1741 if (!stmt
1742 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1743 {
1744 if (!swap_not_matching)
1745 goto fail;
1746 swap_not_matching = false;
1747 break;
1748 }
1749 }
1750 }
1751 while (j != group_size);
1752
1753 /* Swap mismatched definition stmts. */
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE, vect_location,
1756 "Re-trying with swapped operands of stmts ");
1757 for (j = 0; j < group_size; ++j)
1758 if (matches[j] == !swap_not_matching)
1759 {
1760 std::swap (oprnds_info[0]->def_stmts[j],
1761 oprnds_info[1]->def_stmts[j]);
1762 std::swap (oprnds_info[0]->ops[j],
1763 oprnds_info[1]->ops[j]);
1764 if (dump_enabled_p ())
1765 dump_printf (MSG_NOTE, "%d ", j);
1766 }
1767 if (dump_enabled_p ())
1768 dump_printf (MSG_NOTE, "\n");
1769 /* And try again with scratch 'matches' ... */
1770 bool *tem = XALLOCAVEC (bool, group_size);
1771 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1772 group_size, &this_max_nunits,
1773 tem, limit,
1774 &this_tree_size, bst_map)) != NULL)
1775 {
1776 oprnd_info->def_stmts = vNULL;
1777 children.safe_push (child);
1778 continue;
1779 }
1780 }
1781 fail:
1782
1783 /* If the SLP build failed and we analyze a basic block, simply
1784 treat nodes we fail to build as externally defined
1785 (and thus build vectors from the scalar defs).
1786 The cost model will reject outright expensive cases.
1787 ??? This doesn't treat cases where permutation ultimately
1788 fails (or we don't try permutation below). Ideally we'd
1789 even compute a permutation that will end up with the maximum
1790 SLP tree size... */
1791 if (is_a <bb_vec_info> (vinfo)
1792 /* ??? Rejecting patterns this way doesn't work. We'd have to
1793 do extra work to cancel the pattern so the uses see the
1794 scalar version. */
1795 && !is_pattern_stmt_p (stmt_info)
1796 && !oprnd_info->any_pattern)
1797 {
1798 /* But if there's a leading vector-sized set of matching stmts,
1799 fail here so we can split the group. This matches the condition
1800 vect_analyze_slp_instance uses. */
1801 /* ??? We might want to split here and combine the results to support
1802 multiple vector sizes better. */
1803 for (j = 0; j < group_size; ++j)
1804 if (!matches[j])
1805 break;
1806 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1807 {
1808 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_NOTE, vect_location,
1810 "Building vector operands from scalars\n");
1811 this_tree_size++;
1812 child = vect_create_new_slp_node (oprnd_info->ops);
1813 children.safe_push (child);
1814 oprnd_info->ops = vNULL;
1815 continue;
1816 }
1817 }
1818
1819 gcc_assert (child == NULL);
1820 FOR_EACH_VEC_ELT (children, j, child)
1821 if (child)
1822 vect_free_slp_tree (child);
1823 vect_free_oprnd_info (oprnds_info);
1824 return NULL;
1825 }
1826
1827 vect_free_oprnd_info (oprnds_info);
1828
1829 /* If all children of this node are built up from uniform scalars,
1830 or if building it requires more than one possibly expensive vector
1831 construction, just throw the node away so it is built up from scalars.
1832 The exception is the SLP node for the vector store. */
1833 if (is_a <bb_vec_info> (vinfo)
1834 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1835 /* ??? Rejecting patterns this way doesn't work. We'd have to
1836 do extra work to cancel the pattern so the uses see the
1837 scalar version. */
1838 && !is_pattern_stmt_p (stmt_info))
1839 {
1840 slp_tree child;
1841 unsigned j;
1842 bool all_uniform_p = true;
1843 unsigned n_vector_builds = 0;
1844 FOR_EACH_VEC_ELT (children, j, child)
1845 {
1846 if (!child)
1847 ;
1848 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1849 all_uniform_p = false;
1850 else if (!vect_slp_tree_uniform_p (child))
1851 {
1852 all_uniform_p = false;
1853 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1854 n_vector_builds++;
1855 }
1856 }
1857 if (all_uniform_p || n_vector_builds > 1)
1858 {
1859 /* Roll back. */
1860 matches[0] = false;
1861 FOR_EACH_VEC_ELT (children, j, child)
1862 if (child)
1863 vect_free_slp_tree (child);
1864
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_NOTE, vect_location,
1867 "Building parent vector operands from "
1868 "scalars instead\n");
1869 return NULL;
1870 }
1871 }
1872
1873 *tree_size += this_tree_size + 1;
1874 *max_nunits = this_max_nunits;
1875
1876 if (two_operators)
1877 {
1878 /* ??? We'd likely want to either cache in bst_map something like
1879 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1880 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1881 explicit stmts to put in so the keying on 'stmts' doesn't
1882 work (but we have the same issue with nodes that use 'ops'). */
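/* As an illustration: for the group { a0+b0, a1-b1, a2+b2, a3-b3 } node
'one' gets code PLUS_EXPR, node 'two' gets MINUS_EXPR and the VEC_PERM
node built below selects lanes { (0,0), (1,1), (0,2), (1,3) } from them. */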
1883 slp_tree one = new _slp_tree;
1884 slp_tree two = new _slp_tree;
1885 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1886 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1887 SLP_TREE_VECTYPE (one) = vectype;
1888 SLP_TREE_VECTYPE (two) = vectype;
1889 SLP_TREE_CHILDREN (one).safe_splice (children);
1890 SLP_TREE_CHILDREN (two).safe_splice (children);
1891 slp_tree child;
1892 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1893 SLP_TREE_REF_COUNT (child)++;
1894
1895 /* Here we record the original defs since this
1896 node represents the final lane configuration. */
1897 node = vect_create_new_slp_node (node, stmts, 2);
1898 SLP_TREE_VECTYPE (node) = vectype;
1899 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1900 SLP_TREE_CHILDREN (node).quick_push (one);
1901 SLP_TREE_CHILDREN (node).quick_push (two);
1902 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1903 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1904 enum tree_code ocode = ERROR_MARK;
1905 stmt_vec_info ostmt_info;
1906 unsigned j = 0;
1907 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1908 {
1909 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1910 if (gimple_assign_rhs_code (ostmt) != code0)
1911 {
1912 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1913 ocode = gimple_assign_rhs_code (ostmt);
1914 j = i;
1915 }
1916 else
1917 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1918 }
1919 SLP_TREE_CODE (one) = code0;
1920 SLP_TREE_CODE (two) = ocode;
1921 SLP_TREE_LANES (one) = stmts.length ();
1922 SLP_TREE_LANES (two) = stmts.length ();
1923 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1924 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1925 return node;
1926 }
1927
1928 node = vect_create_new_slp_node (node, stmts, nops);
1929 SLP_TREE_VECTYPE (node) = vectype;
1930 SLP_TREE_CHILDREN (node).splice (children);
1931 return node;
1932 }
1933
1934 /* Dump a single SLP tree NODE. */
1935
1936 static void
1937 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1938 slp_tree node)
1939 {
1940 unsigned i, j;
1941 slp_tree child;
1942 stmt_vec_info stmt_info;
1943 tree op;
1944
1945 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1946 dump_user_location_t user_loc = loc.get_user_location ();
1947 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1948 SLP_TREE_DEF_TYPE (node) == vect_external_def
1949 ? " (external)"
1950 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1951 ? " (constant)"
1952 : ""), node,
1953 estimated_poly_value (node->max_nunits),
1954 SLP_TREE_REF_COUNT (node));
1955 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
1956 {
1957 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
1958 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
1959 else
1960 dump_printf_loc (metadata, user_loc, "op template: %G",
1961 SLP_TREE_REPRESENTATIVE (node)->stmt);
1962 }
1963 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1964 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1965 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1966 else
1967 {
1968 dump_printf_loc (metadata, user_loc, "\t{ ");
1969 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1970 dump_printf (metadata, "%T%s ", op,
1971 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1972 dump_printf (metadata, "}\n");
1973 }
1974 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1975 {
1976 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1977 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1978 dump_printf (dump_kind, " %u", j);
1979 dump_printf (dump_kind, " }\n");
1980 }
1981 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1982 {
1983 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1984 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1985 dump_printf (dump_kind, " %u[%u]",
1986 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1987 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1988 dump_printf (dump_kind, " }\n");
1989 }
1990 if (SLP_TREE_CHILDREN (node).is_empty ())
1991 return;
1992 dump_printf_loc (metadata, user_loc, "\tchildren");
1993 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1994 dump_printf (dump_kind, " %p", (void *)child);
1995 dump_printf (dump_kind, "\n");
1996 }
1997
1998 DEBUG_FUNCTION void
1999 debug (slp_tree node)
2000 {
2001 debug_dump_context ctx;
2002 vect_print_slp_tree (MSG_NOTE,
2003 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2004 node);
2005 }
2006
2007 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2008
2009 static void
2010 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2011 slp_tree node, hash_set<slp_tree> &visited)
2012 {
2013 unsigned i;
2014 slp_tree child;
2015
2016 if (visited.add (node))
2017 return;
2018
2019 vect_print_slp_tree (dump_kind, loc, node);
2020
2021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2022 if (child)
2023 vect_print_slp_graph (dump_kind, loc, child, visited);
2024 }
2025
2026 static void
2027 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2028 slp_tree entry)
2029 {
2030 hash_set<slp_tree> visited;
2031 vect_print_slp_graph (dump_kind, loc, entry, visited);
2032 }
2033
2034 /* Mark the tree rooted at NODE with PURE_SLP. */
2035
2036 static void
2037 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2038 {
2039 int i;
2040 stmt_vec_info stmt_info;
2041 slp_tree child;
2042
2043 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2044 return;
2045
2046 if (visited.add (node))
2047 return;
2048
2049 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2050 STMT_SLP_TYPE (stmt_info) = pure_slp;
2051
2052 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2053 if (child)
2054 vect_mark_slp_stmts (child, visited);
2055 }
2056
2057 static void
2058 vect_mark_slp_stmts (slp_tree node)
2059 {
2060 hash_set<slp_tree> visited;
2061 vect_mark_slp_stmts (node, visited);
2062 }
2063
2064 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2065
2066 static void
2067 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2068 {
2069 int i;
2070 stmt_vec_info stmt_info;
2071 slp_tree child;
2072
2073 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2074 return;
2075
2076 if (visited.add (node))
2077 return;
2078
2079 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2080 {
2081 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2082 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2083 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2084 }
2085
2086 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2087 if (child)
2088 vect_mark_slp_stmts_relevant (child, visited);
2089 }
2090
2091 static void
2092 vect_mark_slp_stmts_relevant (slp_tree node)
2093 {
2094 hash_set<slp_tree> visited;
2095 vect_mark_slp_stmts_relevant (node, visited);
2096 }
2097
2098
2099 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2100
2101 static void
2102 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2103 hash_set<slp_tree> &visited)
2104 {
2105 if (!node || visited.add (node))
2106 return;
2107
2108 if (SLP_TREE_CHILDREN (node).length () == 0)
2109 {
2110 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2111 return;
2112 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2113 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2114 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2115 loads.safe_push (node);
2116 }
2117 else
2118 {
2119 unsigned i;
2120 slp_tree child;
2121 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2122 vect_gather_slp_loads (loads, child, visited);
2123 }
2124 }
2125
2126
2127 /* Find the last scalar stmt in NODE. */
2128
2129 stmt_vec_info
2130 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2131 {
2132 stmt_vec_info last = NULL;
2133 stmt_vec_info stmt_vinfo;
2134
2135 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2136 {
2137 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2138 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2139 }
2140
2141 return last;
2142 }
2143
2144 /* Find the first stmt in NODE. */
2145
2146 stmt_vec_info
2147 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2148 {
2149 stmt_vec_info first = NULL;
2150 stmt_vec_info stmt_vinfo;
2151
2152 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2153 {
2154 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2155 if (!first
2156 || get_later_stmt (stmt_vinfo, first) == first)
2157 first = stmt_vinfo;
2158 }
2159
2160 return first;
2161 }
2162
2163 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2164 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2165 (also containing the first GROUP1_SIZE stmts, since stores are
2166 consecutive), the second containing the remainder.
2167 Return the first stmt in the second group. */
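/* For example, splitting a group of 8 stores with GROUP1_SIZE == 3 leaves
stmts 0..2 in the first group and returns stmt 3 as the first element of
the remaining group of size 5. */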
2168
2169 static stmt_vec_info
2170 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2171 {
2172 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2173 gcc_assert (group1_size > 0);
2174 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2175 gcc_assert (group2_size > 0);
2176 DR_GROUP_SIZE (first_vinfo) = group1_size;
2177
2178 stmt_vec_info stmt_info = first_vinfo;
2179 for (unsigned i = group1_size; i > 1; i--)
2180 {
2181 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2182 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2183 }
2184 /* STMT is now the last element of the first group. */
2185 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2186 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2187
2188 DR_GROUP_SIZE (group2) = group2_size;
2189 for (stmt_info = group2; stmt_info;
2190 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2191 {
2192 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2193 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2194 }
2195
2196 /* For the second group, the DR_GROUP_GAP is that before the original group,
2197 plus skipping over the first vector. */
2198 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2199
2200 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2201 DR_GROUP_GAP (first_vinfo) += group2_size;
2202
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2205 group1_size, group2_size);
2206
2207 return group2;
2208 }
2209
2210 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2211 statements and a vector of NUNITS elements. */
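/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
is 12 and the unrolling factor is 12 / 6 == 2. */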
2212
2213 static poly_uint64
2214 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2215 {
2216 return exact_div (common_multiple (nunits, group_size), group_size);
2217 }
2218
2219 static bool
2220 vect_analyze_slp_instance (vec_info *vinfo,
2221 scalar_stmts_to_slp_tree_map_t *bst_map,
2222 stmt_vec_info stmt_info, slp_instance_kind kind,
2223 unsigned max_tree_size, unsigned *limit);
2224
2225 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2226 of KIND. Return true if successful. */
2227
2228 static bool
2229 vect_build_slp_instance (vec_info *vinfo,
2230 slp_instance_kind kind,
2231 vec<stmt_vec_info> &scalar_stmts,
2232 stmt_vec_info root_stmt_info,
2233 unsigned max_tree_size, unsigned *limit,
2234 scalar_stmts_to_slp_tree_map_t *bst_map,
2235 /* ??? We need stmt_info for group splitting. */
2236 stmt_vec_info stmt_info_)
2237 {
2238 if (dump_enabled_p ())
2239 {
2240 dump_printf_loc (MSG_NOTE, vect_location,
2241 "Starting SLP discovery for\n");
2242 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2243 dump_printf_loc (MSG_NOTE, vect_location,
2244 " %G", scalar_stmts[i]->stmt);
2245 }
2246
2247 /* Build the tree for the SLP instance. */
2248 unsigned int group_size = scalar_stmts.length ();
2249 bool *matches = XALLOCAVEC (bool, group_size);
2250 poly_uint64 max_nunits = 1;
2251 unsigned tree_size = 0;
2252 unsigned i;
2253 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2254 &max_nunits, matches, limit,
2255 &tree_size, bst_map);
2256 if (node != NULL)
2257 {
2258 /* Calculate the unrolling factor based on the smallest type. */
2259 poly_uint64 unrolling_factor
2260 = calculate_unrolling_factor (max_nunits, group_size);
2261
2262 if (maybe_ne (unrolling_factor, 1U)
2263 && is_a <bb_vec_info> (vinfo))
2264 {
2265 unsigned HOST_WIDE_INT const_max_nunits;
2266 if (!max_nunits.is_constant (&const_max_nunits)
2267 || const_max_nunits > group_size)
2268 {
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 "Build SLP failed: store group "
2272 "size not a multiple of the vector size "
2273 "in basic block SLP\n");
2274 vect_free_slp_tree (node);
2275 return false;
2276 }
2277 /* Fatal mismatch. */
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_NOTE, vect_location,
2280 "SLP discovery succeeded but node needs "
2281 "splitting\n");
2282 memset (matches, true, group_size);
2283 matches[group_size / const_max_nunits * const_max_nunits] = false;
2284 vect_free_slp_tree (node);
2285 }
2286 else
2287 {
2288 /* Create a new SLP instance. */
2289 slp_instance new_instance = XNEW (class _slp_instance);
2290 SLP_INSTANCE_TREE (new_instance) = node;
2291 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2292 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2293 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2294 SLP_INSTANCE_KIND (new_instance) = kind;
2295 new_instance->reduc_phis = NULL;
2296 new_instance->cost_vec = vNULL;
2297 new_instance->subgraph_entries = vNULL;
2298
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_NOTE, vect_location,
2301 "SLP size %u vs. limit %u.\n",
2302 tree_size, max_tree_size);
2303
2304 /* Fixup SLP reduction chains. */
2305 if (kind == slp_inst_kind_reduc_chain)
2306 {
2307 /* If this is a reduction chain with a conversion in front
2308 amend the SLP tree with a node for that. */
2309 gimple *scalar_def
2310 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2311 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2312 {
2313 /* Get at the conversion stmt - we know it's the single use
2314 of the last stmt of the reduction chain. */
2315 use_operand_p use_p;
2316 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2317 &use_p, &scalar_def);
2318 gcc_assert (r);
2319 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2320 next_info = vect_stmt_to_vectorize (next_info);
2321 scalar_stmts = vNULL;
2322 scalar_stmts.create (group_size);
2323 for (unsigned i = 0; i < group_size; ++i)
2324 scalar_stmts.quick_push (next_info);
2325 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2326 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2327 SLP_TREE_CHILDREN (conv).quick_push (node);
2328 SLP_INSTANCE_TREE (new_instance) = conv;
2329 /* We also have to fake this conversion stmt as SLP reduction
2330 group so we don't have to mess with too much code
2331 elsewhere. */
2332 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2333 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2334 }
2335 /* Fill the backedge child of the PHI SLP node. The
2336 general matching code cannot find it because the
2337 scalar code does not reflect how we vectorize the
2338 reduction. */
2339 use_operand_p use_p;
2340 imm_use_iterator imm_iter;
2341 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2342 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2343 gimple_get_lhs (scalar_def))
2344 /* There are exactly two non-debug uses, the reduction
2345 PHI and the loop-closed PHI node. */
2346 if (!is_gimple_debug (USE_STMT (use_p))
2347 && gimple_bb (USE_STMT (use_p)) == loop->header)
2348 {
2349 auto_vec<stmt_vec_info, 64> phis (group_size);
2350 stmt_vec_info phi_info
2351 = vinfo->lookup_stmt (USE_STMT (use_p));
2352 for (unsigned i = 0; i < group_size; ++i)
2353 phis.quick_push (phi_info);
2354 slp_tree *phi_node = bst_map->get (phis);
2355 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2356 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2357 = SLP_INSTANCE_TREE (new_instance);
2358 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2359 }
2360 }
2361
2362 vinfo->slp_instances.safe_push (new_instance);
2363
2364 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2365 the number of scalar stmts in the root in a few places.
2366 Verify that assumption holds. */
2367 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2368 .length () == group_size);
2369
2370 if (dump_enabled_p ())
2371 {
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "Final SLP tree for instance %p:\n", new_instance);
2374 vect_print_slp_graph (MSG_NOTE, vect_location,
2375 SLP_INSTANCE_TREE (new_instance));
2376 }
2377
2378 return true;
2379 }
2380 }
2381 else
2382 {
2383 /* Failed to SLP. */
2384 /* Free the allocated memory. */
2385 scalar_stmts.release ();
2386 }
2387
2388 stmt_vec_info stmt_info = stmt_info_;
2389 /* Try to break the group up into pieces. */
2390 if (kind == slp_inst_kind_store)
2391 {
2392 /* ??? We could delay all the actual splitting of store-groups
2393 until after SLP discovery of the original group completed.
2394 Then we can recurse to vect_build_slp_instance directly. */
2395 for (i = 0; i < group_size; i++)
2396 if (!matches[i])
2397 break;
2398
2399 /* For basic block SLP, try to break the group up into multiples of
2400 a vector size. */
2401 if (is_a <bb_vec_info> (vinfo)
2402 && (i > 1 && i < group_size))
2403 {
2404 tree scalar_type
2405 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2406 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2407 1 << floor_log2 (i));
2408 unsigned HOST_WIDE_INT const_nunits;
2409 if (vectype
2410 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2411 {
2412 /* Split into two groups at the first vector boundary. */
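/* E.g. i == 5 matching stmts and const_nunits == 4 give group1_size == 4. */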
2413 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2414 unsigned group1_size = i & ~(const_nunits - 1);
2415
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_NOTE, vect_location,
2418 "Splitting SLP group at stmt %u\n", i);
2419 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2420 group1_size);
2421 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2422 kind, max_tree_size,
2423 limit);
2424 /* Split the rest at the failure point and possibly
2425 re-analyze the remaining matching part if it has
2426 at least two lanes. */
2427 if (group1_size < i
2428 && (i + 1 < group_size
2429 || i - group1_size > 1))
2430 {
2431 stmt_vec_info rest2 = rest;
2432 rest = vect_split_slp_store_group (rest, i - group1_size);
2433 if (i - group1_size > 1)
2434 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2435 kind, max_tree_size,
2436 limit);
2437 }
2438 /* Re-analyze the non-matching tail if it has at least
2439 two lanes. */
2440 if (i + 1 < group_size)
2441 res |= vect_analyze_slp_instance (vinfo, bst_map,
2442 rest, kind, max_tree_size,
2443 limit);
2444 return res;
2445 }
2446 }
2447
2448 /* For loop vectorization split into arbitrary pieces of size > 1. */
2449 if (is_a <loop_vec_info> (vinfo)
2450 && (i > 1 && i < group_size))
2451 {
2452 unsigned group1_size = i;
2453
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_NOTE, vect_location,
2456 "Splitting SLP group at stmt %u\n", i);
2457
2458 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2459 group1_size);
2460 /* Loop vectorization cannot handle gaps in stores, make sure
2461 the split group appears as strided. */
2462 STMT_VINFO_STRIDED_P (rest) = 1;
2463 DR_GROUP_GAP (rest) = 0;
2464 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2465 DR_GROUP_GAP (stmt_info) = 0;
2466
2467 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2468 kind, max_tree_size, limit);
2469 if (i + 1 < group_size)
2470 res |= vect_analyze_slp_instance (vinfo, bst_map,
2471 rest, kind, max_tree_size, limit);
2472
2473 return res;
2474 }
2475
2476 /* Even though the first vector did not all match, we might be able to SLP
2477 (some) of the remainder. FORNOW ignore this possibility. */
2478 }
2479
2480 /* Failed to SLP. */
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2483 return false;
2484 }
2485
2486
2487 /* Analyze an SLP instance starting from a group of grouped stores. Call
2488 vect_build_slp_tree to build a tree of packed stmts if possible.
2489 Return FALSE if it's impossible to SLP any stmt in the loop. */
2490
2491 static bool
2492 vect_analyze_slp_instance (vec_info *vinfo,
2493 scalar_stmts_to_slp_tree_map_t *bst_map,
2494 stmt_vec_info stmt_info,
2495 slp_instance_kind kind,
2496 unsigned max_tree_size, unsigned *limit)
2497 {
2498 unsigned int i;
2499 vec<stmt_vec_info> scalar_stmts;
2500
2501 if (is_a <bb_vec_info> (vinfo))
2502 vect_location = stmt_info->stmt;
2503
2504 stmt_vec_info next_info = stmt_info;
2505 if (kind == slp_inst_kind_store)
2506 {
2507 /* Collect the stores and store them in scalar_stmts. */
2508 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2509 while (next_info)
2510 {
2511 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2512 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2513 }
2514 }
2515 else if (kind == slp_inst_kind_reduc_chain)
2516 {
2517 /* Collect the reduction stmts and store them in scalar_stmts. */
2518 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2519 while (next_info)
2520 {
2521 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2522 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2523 }
2524 /* Mark the first element of the reduction chain as reduction to properly
2525 transform the node. In the reduction analysis phase only the last
2526 element of the chain is marked as reduction. */
2527 STMT_VINFO_DEF_TYPE (stmt_info)
2528 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2529 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2530 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2531 }
2532 else if (kind == slp_inst_kind_ctor)
2533 {
2534 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2535 tree val;
2536 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2537 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2538 {
2539 stmt_vec_info def_info = vinfo->lookup_def (val);
2540 def_info = vect_stmt_to_vectorize (def_info);
2541 scalar_stmts.quick_push (def_info);
2542 }
2543 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_NOTE, vect_location,
2545 "Analyzing vectorizable constructor: %G\n",
2546 stmt_info->stmt);
2547 }
2548 else if (kind == slp_inst_kind_reduc_group)
2549 {
2550 /* Collect reduction statements. */
2551 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2552 scalar_stmts.create (reductions.length ());
2553 for (i = 0; reductions.iterate (i, &next_info); i++)
2554 if (STMT_VINFO_RELEVANT_P (next_info)
2555 || STMT_VINFO_LIVE_P (next_info))
2556 scalar_stmts.quick_push (next_info);
2557 /* If fewer than two were relevant/live there's nothing to SLP. */
2558 if (scalar_stmts.length () < 2)
2559 return false;
2560 }
2561 else
2562 gcc_unreachable ();
2563
2564 /* Build the tree for the SLP instance. */
2565 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2566 kind == slp_inst_kind_ctor
2567 ? stmt_info : NULL,
2568 max_tree_size, limit, bst_map,
2569 kind == slp_inst_kind_store
2570 ? stmt_info : NULL);
2571
2572 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2573 where we should do store group splitting. */
2574
2575 return res;
2576 }
2577
2578 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2579 trees of packed scalar stmts if SLP is possible. */
2580
2581 opt_result
2582 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2583 {
2584 unsigned int i;
2585 stmt_vec_info first_element;
2586
2587 DUMP_VECT_SCOPE ("vect_analyze_slp");
2588
2589 unsigned limit = max_tree_size;
2590
2591 scalar_stmts_to_slp_tree_map_t *bst_map
2592 = new scalar_stmts_to_slp_tree_map_t ();
2593
2594 /* Find SLP sequences starting from groups of grouped stores. */
2595 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2596 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2597 STMT_VINFO_GROUPED_ACCESS (first_element)
2598 ? slp_inst_kind_store : slp_inst_kind_ctor,
2599 max_tree_size, &limit);
2600
2601 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2602 {
2603 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2604 {
2605 vect_location = bb_vinfo->roots[i].root->stmt;
2606 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2607 bb_vinfo->roots[i].stmts,
2608 bb_vinfo->roots[i].root,
2609 max_tree_size, &limit, bst_map, NULL))
2610 bb_vinfo->roots[i].stmts = vNULL;
2611 }
2612 }
2613
2614 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2615 {
2616 /* Find SLP sequences starting from reduction chains. */
2617 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2618 if (! STMT_VINFO_RELEVANT_P (first_element)
2619 && ! STMT_VINFO_LIVE_P (first_element))
2620 ;
2621 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2622 slp_inst_kind_reduc_chain,
2623 max_tree_size, &limit))
2624 {
2625 /* Dissolve reduction chain group. */
2626 stmt_vec_info vinfo = first_element;
2627 stmt_vec_info last = NULL;
2628 while (vinfo)
2629 {
2630 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2631 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2632 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2633 last = vinfo;
2634 vinfo = next;
2635 }
2636 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2637 /* It can be still vectorized as part of an SLP reduction. */
2638 loop_vinfo->reductions.safe_push (last);
2639 }
2640
2641 /* Find SLP sequences starting from groups of reductions. */
2642 if (loop_vinfo->reductions.length () > 1)
2643 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2644 slp_inst_kind_reduc_group, max_tree_size,
2645 &limit);
2646 }
2647
2648 /* The map keeps a reference on SLP nodes built, release that. */
2649 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2650 it != bst_map->end (); ++it)
2651 if ((*it).second)
2652 vect_free_slp_tree ((*it).second);
2653 delete bst_map;
2654
2655 return opt_result::success ();
2656 }
2657
2658 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2659
2660 static void
2661 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2662 vec<slp_tree> &vertices, vec<int> &leafs)
2663 {
2664 unsigned i;
2665 slp_tree child;
2666
2667 if (visited.add (node))
2668 return;
2669
2670 node->vertex = vertices.length ();
2671 vertices.safe_push (node);
2672
2673 bool leaf = true;
2674 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2675 if (child)
2676 {
2677 leaf = false;
2678 vect_slp_build_vertices (visited, child, vertices, leafs);
2679 }
2680 if (leaf)
2681 leafs.safe_push (node->vertex);
2682 }
2683
2684 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2685
2686 static void
2687 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2688 vec<int> &leafs)
2689 {
2690 hash_set<slp_tree> visited;
2691 unsigned i;
2692 slp_instance instance;
2693 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2694 {
2695 unsigned n_v = vertices.length ();
2696 unsigned n_l = leafs.length ();
2697 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2698 leafs);
2699 /* If we added vertices but no entries to the reverse graph we've
2700 added a cycle that is not backwards-reachable. Push the entry
2701 so it mimics a leaf then. */
2702 if (vertices.length () > n_v
2703 && leafs.length () == n_l)
2704 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2705 }
2706 }
2707
2708 /* Apply (reverse) bijective PERM to VEC. */
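/* For example, with PERM == { 2, 0, 1 } and VEC == { a, b, c } the forward
application yields { c, a, b } and the reverse application yields { b, c, a }. */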
2709
2710 template <class T>
2711 static void
2712 vect_slp_permute (vec<unsigned> perm,
2713 vec<T> &vec, bool reverse)
2714 {
2715 auto_vec<T, 64> saved;
2716 saved.create (vec.length ());
2717 for (unsigned i = 0; i < vec.length (); ++i)
2718 saved.quick_push (vec[i]);
2719
2720 if (reverse)
2721 {
2722 for (unsigned i = 0; i < vec.length (); ++i)
2723 vec[perm[i]] = saved[i];
2724 for (unsigned i = 0; i < vec.length (); ++i)
2725 gcc_assert (vec[perm[i]] == saved[i]);
2726 }
2727 else
2728 {
2729 for (unsigned i = 0; i < vec.length (); ++i)
2730 vec[i] = saved[perm[i]];
2731 for (unsigned i = 0; i < vec.length (); ++i)
2732 gcc_assert (vec[i] == saved[perm[i]]);
2733 }
2734 }
2735
2736 /* Return whether permutations PERM_A and PERM_B as recorded in the
2737 PERMS vector are equal. */
2738
2739 static bool
2740 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2741 int perm_a, int perm_b)
2742 {
2743 return (perm_a == perm_b
2744 || (perms[perm_a].length () == perms[perm_b].length ()
2745 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2746 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2747 }
2748
2749 /* Optimize the SLP graph of VINFO. */
2750
2751 void
2752 vect_optimize_slp (vec_info *vinfo)
2753 {
2754 if (vinfo->slp_instances.is_empty ())
2755 return;
2756
2757 slp_tree node;
2758 unsigned i;
2759 auto_vec<slp_tree> vertices;
2760 auto_vec<int> leafs;
2761 vect_slp_build_vertices (vinfo, vertices, leafs);
2762
2763 struct graph *slpg = new_graph (vertices.length ());
2764 FOR_EACH_VEC_ELT (vertices, i, node)
2765 {
2766 unsigned j;
2767 slp_tree child;
2768 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2769 if (child)
2770 add_edge (slpg, i, child->vertex);
2771 }
2772
2773 /* Compute (reverse) postorder on the inverted graph. */
2774 auto_vec<int> ipo;
2775 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2776
2777 auto_sbitmap n_visited (vertices.length ());
2778 auto_sbitmap n_materialize (vertices.length ());
2779 auto_vec<int> n_perm (vertices.length ());
2780 auto_vec<vec<unsigned> > perms;
2781
2782 bitmap_clear (n_visited);
2783 bitmap_clear (n_materialize);
2784 n_perm.quick_grow_cleared (vertices.length ());
2785 perms.safe_push (vNULL); /* zero is no permute */
2786
2787 /* Produce initial permutations. */
2788 for (i = 0; i < leafs.length (); ++i)
2789 {
2790 int idx = leafs[i];
2791 slp_tree node = vertices[idx];
2792
2793 /* Handle externals and constants optimistically throughout the
2794 iteration. */
2795 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2796 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2797 continue;
2798
2799 /* Leafs do not change across iterations. Note leafs also double
2800 as entries to the reverse graph. */
2801 if (!slpg->vertices[idx].succ)
2802 bitmap_set_bit (n_visited, idx);
2803 /* Loads are the only thing generating permutes. */
2804 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2805 continue;
2806
2807 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2808 node unpermuted, record this permute. */
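/* For example, a load permutation { 5, 4, 7, 6 } on a four-lane node has
IMIN == 4 and is recorded as the split-out permute { 1, 0, 3, 2 }. */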
2809 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2810 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2811 continue;
2812 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2813 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2814 bool any_permute = false;
2815 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2816 {
2817 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2818 imin = MIN (imin, idx);
2819 imax = MAX (imax, idx);
2820 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2821 any_permute = true;
2822 }
2823 /* If there's no permute no need to split one out. */
2824 if (!any_permute)
2825 continue;
2826 /* If the span doesn't match we'd disrupt VF computation, avoid
2827 that for now. */
2828 if (imax - imin + 1 != SLP_TREE_LANES (node))
2829 continue;
2830
2831 /* For now only handle true permutes, like
2832 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2833 when permuting constants and invariants keeping the permute
2834 bijective. */
2835 auto_sbitmap load_index (SLP_TREE_LANES (node));
2836 bitmap_clear (load_index);
2837 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2838 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2839 unsigned j;
2840 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2841 if (!bitmap_bit_p (load_index, j))
2842 break;
2843 if (j != SLP_TREE_LANES (node))
2844 continue;
2845
2846 vec<unsigned> perm = vNULL;
2847 perm.safe_grow (SLP_TREE_LANES (node), true);
2848 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2849 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2850 perms.safe_push (perm);
2851 n_perm[idx] = perms.length () - 1;
2852 }
2853
2854 /* Propagate permutes along the graph and compute materialization points. */
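/* A node's permute is the meet of its successors' permutes: equal permutes
propagate unchanged while conflicting ones collapse to the identity
(permute index zero). */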
2855 bool changed;
2856 unsigned iteration = 0;
2857 do
2858 {
2859 changed = false;
2860 ++iteration;
2861
2862 for (i = vertices.length (); i > 0 ; --i)
2863 {
2864 int idx = ipo[i-1];
2865 slp_tree node = vertices[idx];
2866 /* For leafs there's nothing to do - we've seeded permutes
2867 on those above. */
2868 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2869 continue;
2870
2871 bitmap_set_bit (n_visited, idx);
2872
2873 /* We cannot move a permute across a store. */
2874 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2875 && DR_IS_WRITE
2876 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2877 continue;
2878
2879 int perm = -1;
2880 for (graph_edge *succ = slpg->vertices[idx].succ;
2881 succ; succ = succ->succ_next)
2882 {
2883 int succ_idx = succ->dest;
2884 /* Handle unvisited nodes optimistically. */
2885 /* ??? But for constants once we want to handle non-bijective
2886 permutes we have to verify the permute, when unifying lanes,
2887 will not unify different constants. For example see
2888 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2889 if (!bitmap_bit_p (n_visited, succ_idx))
2890 continue;
2891 int succ_perm = n_perm[succ_idx];
2892 /* Once we materialize the successor's permutation its output lanes
2893 appear unpermuted to us. */
2894 if (bitmap_bit_p (n_materialize, succ_idx))
2895 succ_perm = 0;
2896 if (perm == -1)
2897 perm = succ_perm;
2898 else if (succ_perm == 0)
2899 {
2900 perm = 0;
2901 break;
2902 }
2903 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2904 {
2905 perm = 0;
2906 break;
2907 }
2908 }
2909
2910 if (perm == -1)
2911 /* Pick up pre-computed leaf values. */
2912 perm = n_perm[idx];
2913 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2914 {
2915 if (iteration > 1)
2916 /* Make sure we eventually converge. */
2917 gcc_checking_assert (perm == 0);
2918 n_perm[idx] = perm;
2919 if (perm == 0)
2920 bitmap_clear_bit (n_materialize, idx);
2921 changed = true;
2922 }
2923
2924 if (perm == 0)
2925 continue;
2926
2927 /* Elide pruning at materialization points in the first
2928 iteration so every node is visited at least once. */
2929 if (iteration == 1)
2930 continue;
2931
2932 /* Decide on permute materialization. Look whether there's
2933 a use (pred) edge that is permuted differently than us.
2934 In that case mark ourselves so the permutation is applied. */
2935 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2936 for (graph_edge *pred = slpg->vertices[idx].pred;
2937 pred; pred = pred->pred_next)
2938 {
2939 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2940 int pred_perm = n_perm[pred->src];
2941 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2942 {
2943 all_preds_permuted = false;
2944 break;
2945 }
2946 }
2947 if (!all_preds_permuted)
2948 {
2949 if (!bitmap_bit_p (n_materialize, idx))
2950 changed = true;
2951 bitmap_set_bit (n_materialize, idx);
2952 }
2953 }
2954 }
2955 while (changed || iteration == 1);
2956
2957 /* Materialize. */
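/* Non-materialized nodes get the permutation pushed into their scalar
stmts and load permutations; materialization points either absorb it
into an existing permute node or get an explicit VEC_PERM node wrapping
a copy of the original node. */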
2958 for (i = 0; i < vertices.length (); ++i)
2959 {
2960 int perm = n_perm[i];
2961 if (perm <= 0)
2962 continue;
2963
2964 slp_tree node = vertices[i];
2965
2966 /* First permute invariant/external original successors. */
2967 unsigned j;
2968 slp_tree child;
2969 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2970 {
2971 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2972 continue;
2973
2974 /* If the vector is uniform there's nothing to do. */
2975 if (vect_slp_tree_uniform_p (child))
2976 continue;
2977
2978 /* We can end up sharing some externals via two_operator
2979 handling. Be prepared to unshare those. */
2980 if (child->refcnt != 1)
2981 {
2982 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2983 SLP_TREE_CHILDREN (node)[j] = child
2984 = vect_create_new_slp_node
2985 (SLP_TREE_SCALAR_OPS (child).copy ());
2986 }
2987 vect_slp_permute (perms[perm],
2988 SLP_TREE_SCALAR_OPS (child), true);
2989 }
2990
2991 if (bitmap_bit_p (n_materialize, i))
2992 {
2993 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2994 /* For loads simply drop the permutation, the load permutation
2995 already performs the desired permutation. */
2996 ;
2997 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2998 {
2999 /* If the node is already a permute node we just need to apply
3000 the permutation to the permute node itself. */
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_NOTE, vect_location,
3003 "simplifying permute node %p\n",
3004 node);
3005
3006 vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
3007 true);
3008 }
3009 else
3010 {
3011 if (dump_enabled_p ())
3012 dump_printf_loc (MSG_NOTE, vect_location,
3013 "inserting permute node in place of %p\n",
3014 node);
3015
3016 /* Make a copy of NODE and in-place change it to a
3017 VEC_PERM node to permute the lanes of the copy. */
3018 slp_tree copy = new _slp_tree;
3019 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3020 SLP_TREE_CHILDREN (node) = vNULL;
3021 SLP_TREE_SCALAR_STMTS (copy)
3022 = SLP_TREE_SCALAR_STMTS (node).copy ();
3023 vect_slp_permute (perms[perm],
3024 SLP_TREE_SCALAR_STMTS (copy), true);
3025 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3026 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3027 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3028 SLP_TREE_LANE_PERMUTATION (copy)
3029 = SLP_TREE_LANE_PERMUTATION (node);
3030 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3031 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3032 copy->refcnt = 1;
3033 copy->max_nunits = node->max_nunits;
3034 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3035 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3036 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3037
3038 /* Now turn NODE into a VEC_PERM. */
3039 SLP_TREE_CHILDREN (node).safe_push (copy);
3040 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3041 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3042 SLP_TREE_LANE_PERMUTATION (node)
3043 .quick_push (std::make_pair (0, perms[perm][j]));
3044 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3045 }
3046 }
3047 else
3048 {
3049 /* Apply the reverse permutation to our stmts. */
3050 vect_slp_permute (perms[perm],
3051 SLP_TREE_SCALAR_STMTS (node), true);
3052 /* And to the load permutation, which we can simply
3053 make regular by design. */
3054 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3055 {
3056 /* ??? When we handle non-bijective permutes the idea
3057 is that we can force the load-permutation to be
3058 { min, min + 1, min + 2, ... max }. But then the
3059 scalar defs might no longer match the lane content
3060 which means wrong-code with live lane vectorization.
3061 So we possibly have to have NULL entries for those. */
3062 vect_slp_permute (perms[perm],
3063 SLP_TREE_LOAD_PERMUTATION (node), true);
3064 }
3065 }
3066 }
3067
3068 /* Free the perms vector used for propagation. */
3069 while (!perms.is_empty ())
3070 perms.pop ().release ();
3071 free_graph (slpg);
3072
3073
3074 /* Now elide load permutations that are not necessary. */
3075 for (i = 0; i < leafs.length (); ++i)
3076 {
3077 node = vertices[leafs[i]];
3078 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3079 continue;
3080
3081 /* In basic block vectorization we allow any subchain of an interleaving
3082 chain.
3083 FORNOW: not in loop SLP because of realignment complications. */
3084 if (is_a <bb_vec_info> (vinfo))
3085 {
3086 bool subchain_p = true;
3087 stmt_vec_info next_load_info = NULL;
3088 stmt_vec_info load_info;
3089 unsigned j;
3090 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3091 {
3092 if (j != 0
3093 && (next_load_info != load_info
3094 || DR_GROUP_GAP (load_info) != 1))
3095 {
3096 subchain_p = false;
3097 break;
3098 }
3099 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3100 }
3101 if (subchain_p)
3102 {
3103 SLP_TREE_LOAD_PERMUTATION (node).release ();
3104 continue;
3105 }
3106 }
3107 else
3108 {
3109 stmt_vec_info load_info;
3110 bool this_load_permuted = false;
3111 unsigned j;
3112 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3113 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3114 {
3115 this_load_permuted = true;
3116 break;
3117 }
3118 stmt_vec_info first_stmt_info
3119 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3120 if (!this_load_permuted
3121 /* The load requires permutation when unrolling exposes
3122 a gap either because the group is larger than the SLP
3123 group-size or because there is a gap between the groups. */
3124 && (known_eq (LOOP_VINFO_VECT_FACTOR
3125 (as_a <loop_vec_info> (vinfo)), 1U)
3126 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3127 && DR_GROUP_GAP (first_stmt_info) == 0)))
3128 {
3129 SLP_TREE_LOAD_PERMUTATION (node).release ();
3130 continue;
3131 }
3132 }
3133 }
3134 }
3135
3136 /* Gather loads reachable from the individual SLP graph entries. */
3137
3138 void
3139 vect_gather_slp_loads (vec_info *vinfo)
3140 {
3141 unsigned i;
3142 slp_instance instance;
3143 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3144 {
3145 hash_set<slp_tree> visited;
3146 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3147 SLP_INSTANCE_TREE (instance), visited);
3148 }
3149 }
3150
3151
3152 /* For each possible SLP instance decide whether to SLP it and calculate overall
3153 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3154 least one instance. */
3155
3156 bool
3157 vect_make_slp_decision (loop_vec_info loop_vinfo)
3158 {
3159 unsigned int i;
3160 poly_uint64 unrolling_factor = 1;
3161 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3162 slp_instance instance;
3163 int decided_to_slp = 0;
3164
3165 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3166
3167 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3168 {
3169 /* FORNOW: SLP if you can. */
3170 /* All unroll factors have the form:
3171
3172 GET_MODE_SIZE (vinfo->vector_mode) * X
3173
3174 for some rational X, so they must have a common multiple. */
3175 unrolling_factor
3176 = force_common_multiple (unrolling_factor,
3177 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3178
3179 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3180 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3181 loop-based vectorization. Such stmts will be marked as HYBRID. */
3182 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3183 decided_to_slp++;
3184 }
3185
3186 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3187
3188 if (decided_to_slp && dump_enabled_p ())
3189 {
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 "Decided to SLP %d instances. Unrolling factor ",
3192 decided_to_slp);
3193 dump_dec (MSG_NOTE, unrolling_factor);
3194 dump_printf (MSG_NOTE, "\n");
3195 }
3196
3197 return (decided_to_slp > 0);
3198 }
3199
3200 /* Private data for vect_detect_hybrid_slp. */
3201 struct vdhs_data
3202 {
3203 loop_vec_info loop_vinfo;
3204 vec<stmt_vec_info> *worklist;
3205 };
3206
3207 /* Walker for walk_gimple_op. */
3208
3209 static tree
3210 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3211 {
3212 walk_stmt_info *wi = (walk_stmt_info *)data;
3213 vdhs_data *dat = (vdhs_data *)wi->info;
3214
3215 if (wi->is_lhs)
3216 return NULL_TREE;
3217
3218 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3219 if (!def_stmt_info)
3220 return NULL_TREE;
3221 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3222 if (PURE_SLP_STMT (def_stmt_info))
3223 {
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3226 def_stmt_info->stmt);
3227 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3228 dat->worklist->safe_push (def_stmt_info);
3229 }
3230
3231 return NULL_TREE;
3232 }
3233
3234 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3235 if so, otherwise push it to WORKLIST. */
3236
3237 static void
3238 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3239 vec<stmt_vec_info> &worklist,
3240 stmt_vec_info stmt_info)
3241 {
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_NOTE, vect_location,
3244 "Processing hybrid candidate : %G", stmt_info->stmt);
3245 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3246 imm_use_iterator iter2;
3247 ssa_op_iter iter1;
3248 use_operand_p use_p;
3249 def_operand_p def_p;
3250 bool any_def = false;
3251 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3252 {
3253 any_def = true;
3254 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3255 {
3256 if (is_gimple_debug (USE_STMT (use_p)))
3257 continue;
3258 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3259 /* An out-of-loop use means this is a loop_vect sink. */
3260 if (!use_info)
3261 {
3262 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_NOTE, vect_location,
3264 "Found loop_vect sink: %G", stmt_info->stmt);
3265 worklist.safe_push (stmt_info);
3266 return;
3267 }
3268 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3269 {
3270 if (dump_enabled_p ())
3271 dump_printf_loc (MSG_NOTE, vect_location,
3272 "Found loop_vect use: %G", use_info->stmt);
3273 worklist.safe_push (stmt_info);
3274 return;
3275 }
3276 }
3277 }
3278 /* No def means this is a loop_vect sink. */
3279 if (!any_def)
3280 {
3281 if (dump_enabled_p ())
3282 dump_printf_loc (MSG_NOTE, vect_location,
3283 "Found loop_vect sink: %G", stmt_info->stmt);
3284 worklist.safe_push (stmt_info);
3285 return;
3286 }
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_NOTE, vect_location,
3289 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3290 STMT_SLP_TYPE (stmt_info) = pure_slp;
3291 }
3292
3293 /* Find stmts that must be both vectorized and SLPed. */
3294
3295 void
3296 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3297 {
3298 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3299
3300 /* All stmts participating in SLP are marked pure_slp, all other
3301 stmts are loop_vect.
3302 First collect all loop_vect stmts into a worklist.
3303 SLP patterns cause not all original scalar stmts to appear in
3304 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3305 Rectify this here and do a backward walk over the IL only considering
3306 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3307 mark them as pure_slp. */
3308 auto_vec<stmt_vec_info> worklist;
3309 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3310 {
3311 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3312 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3313 gsi_next (&gsi))
3314 {
3315 gphi *phi = gsi.phi ();
3316 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3317 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3318 maybe_push_to_hybrid_worklist (loop_vinfo,
3319 worklist, stmt_info);
3320 }
3321 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3322 gsi_prev (&gsi))
3323 {
3324 gimple *stmt = gsi_stmt (gsi);
3325 if (is_gimple_debug (stmt))
3326 continue;
3327 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3328 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3329 {
3330 for (gimple_stmt_iterator gsi2
3331 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3332 !gsi_end_p (gsi2); gsi_next (&gsi2))
3333 {
3334 stmt_vec_info patt_info
3335 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3336 if (!STMT_SLP_TYPE (patt_info)
3337 && STMT_VINFO_RELEVANT (patt_info))
3338 maybe_push_to_hybrid_worklist (loop_vinfo,
3339 worklist, patt_info);
3340 }
3341 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3342 }
3343 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3344 maybe_push_to_hybrid_worklist (loop_vinfo,
3345 worklist, stmt_info);
3346 }
3347 }
3348
3349 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3350 mark any SLP vectorized stmt as hybrid.
3351 ??? We're visiting def stmts N times (once for each non-SLP and
3352 once for each hybrid-SLP use). */
3353 walk_stmt_info wi;
3354 vdhs_data dat;
3355 dat.worklist = &worklist;
3356 dat.loop_vinfo = loop_vinfo;
3357 memset (&wi, 0, sizeof (wi));
3358 wi.info = (void *)&dat;
3359 while (!worklist.is_empty ())
3360 {
3361 stmt_vec_info stmt_info = worklist.pop ();
3362 /* Since SSA operands are not set up for pattern stmts we need
3363 to use walk_gimple_op. */
3364 wi.is_lhs = 0;
3365 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3366 }
3367 }
3368
3369
3370 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3371
3372 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3373 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3374 {
3375 for (unsigned i = 0; i < bbs.length (); ++i)
3376 {
3377 if (i != 0)
3378 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3379 gsi_next (&si))
3380 {
3381 gphi *phi = si.phi ();
3382 gimple_set_uid (phi, 0);
3383 add_stmt (phi);
3384 }
3385 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3386 !gsi_end_p (gsi); gsi_next (&gsi))
3387 {
3388 gimple *stmt = gsi_stmt (gsi);
3389 gimple_set_uid (stmt, 0);
3390 if (is_gimple_debug (stmt))
3391 continue;
3392 add_stmt (stmt);
3393 }
3394 }
3395 }
3396
3397
3398 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3399 stmts in the basic block. */
3400
3401 _bb_vec_info::~_bb_vec_info ()
3402 {
3403 /* Reset region marker. */
3404 for (unsigned i = 0; i < bbs.length (); ++i)
3405 {
3406 if (i != 0)
3407 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3408 gsi_next (&si))
3409 {
3410 gphi *phi = si.phi ();
3411 gimple_set_uid (phi, -1);
3412 }
3413 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3414 !gsi_end_p (gsi); gsi_next (&gsi))
3415 {
3416 gimple *stmt = gsi_stmt (gsi);
3417 gimple_set_uid (stmt, -1);
3418 }
3419 }
3420
3421 for (unsigned i = 0; i < roots.length (); ++i)
3422 roots[i].stmts.release ();
3423 roots.release ();
3424 }
3425
3426 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3427 given that child nodes have already been processed, and that
3428 their def types currently match their SLP node's def type. */
3429
3430 static bool
3431 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3432 slp_instance node_instance,
3433 stmt_vector_for_cost *cost_vec)
3434 {
3435 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3436 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3437
3438 /* Calculate the number of vector statements to be created for the
3439 scalar stmts in this node. For SLP reductions it is equal to the
3440 number of vector statements in the children (which has already been
3441 calculated by the recursive call). Otherwise it is the number of
3442 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3443 VF divided by the number of elements in a vector. */
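/* For example, with VF == 4, two lanes and a 4-element vector type this
gives 4 * 2 / 4 == 2 vector stmts. */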
3444 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3445 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3446 {
3447 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3448 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3449 {
3450 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3451 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3452 break;
3453 }
3454 }
3455 else
3456 {
3457 poly_uint64 vf;
3458 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3459 vf = loop_vinfo->vectorization_factor;
3460 else
3461 vf = 1;
3462 unsigned int group_size = SLP_TREE_LANES (node);
3463 tree vectype = SLP_TREE_VECTYPE (node);
3464 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3465 = vect_get_num_vectors (vf * group_size, vectype);
3466 }
3467
3468 /* Handle purely internal nodes. */
3469 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3470 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3471
3472 if (is_a <bb_vec_info> (vinfo)
3473 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3474 {
3475 if (dump_enabled_p ())
3476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3477 "desired vector type conflicts with earlier one "
3478 "for %G", stmt_info->stmt);
3479 return false;
3480 }
3481
3482 bool dummy;
3483 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3484 node, node_instance, cost_vec);
3485 }
3486
3487 /* Try to build NODE from scalars, returning true on success.
3488 NODE_INSTANCE is the SLP instance that contains NODE. */
3489
3490 static bool
3491 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3492 slp_instance node_instance)
3493 {
3494 stmt_vec_info stmt_info;
3495 unsigned int i;
3496
3497 if (!is_a <bb_vec_info> (vinfo)
3498 || node == SLP_INSTANCE_TREE (node_instance)
3499 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3500 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3501 return false;
3502
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_NOTE, vect_location,
3505 "Building vector operands of %p from scalars instead\n", node);
3506
3507 /* Don't remove and free the child nodes here, since they could be
3508 referenced by other structures. The analysis and scheduling phases
3509 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3510 unsigned int group_size = SLP_TREE_LANES (node);
3511 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3512 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3513 SLP_TREE_LOAD_PERMUTATION (node).release ();
3514 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3515 {
3516 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3517 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3518 }
3519 return true;
3520 }
3521
3522 /* Compute the prologue cost for invariant or constant operands represented
3523 by NODE. */
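/* As an illustration (assumed operands, not from a testcase): with V4SI
   vectors an external-def node with scalar operands {x_1, x_1, x_1, x_1}
   is costed as a single scalar_to_vec (a splat), {x_1, y_2, x_1, y_2}
   as a vec_construct, and a node of constants as a vector_load from the
   constant pool.  */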
3524
3525 static void
3526 vect_prologue_cost_for_slp (slp_tree node,
3527 stmt_vector_for_cost *cost_vec)
3528 {
3529 /* There's a special case of an existing vector, which costs nothing. */
3530 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3531 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3532 return;
3533 /* Without looking at the actual initializer a vector of
3534 constants can be implemented as a load from the constant pool.
3535 When all elements are the same we can use a splat. */
3536 tree vectype = SLP_TREE_VECTYPE (node);
3537 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3538 unsigned num_vects_to_check;
3539 unsigned HOST_WIDE_INT const_nunits;
3540 unsigned nelt_limit;
3541 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3542 && ! multiple_p (const_nunits, group_size))
3543 {
3544 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3545 nelt_limit = const_nunits;
3546 }
3547 else
3548 {
3549 /* If either the vector has variable length or the vectors
3550 are composed of repeated whole groups we only need to
3551 cost construction once. All vectors will be the same. */
3552 num_vects_to_check = 1;
3553 nelt_limit = group_size;
3554 }
3555 tree elt = NULL_TREE;
3556 unsigned nelt = 0;
3557 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3558 {
3559 unsigned si = j % group_size;
3560 if (nelt == 0)
3561 elt = SLP_TREE_SCALAR_OPS (node)[si];
3562 /* ??? We're just tracking whether all operands of a single
3563 vector initializer are the same; ideally we'd check if
3564 we emitted the same one already. */
3565 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3566 elt = NULL_TREE;
3567 nelt++;
3568 if (nelt == nelt_limit)
3569 {
3570 record_stmt_cost (cost_vec, 1,
3571 SLP_TREE_DEF_TYPE (node) == vect_external_def
3572 ? (elt ? scalar_to_vec : vec_construct)
3573 : vector_load,
3574 NULL, vectype, 0, vect_prologue);
3575 nelt = 0;
3576 }
3577 }
3578 }
3579
3580 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3581 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3582
3583 Return true if the operations are supported. */
3584
3585 static bool
3586 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3587 slp_instance node_instance,
3588 hash_set<slp_tree> &visited_set,
3589 vec<slp_tree> &visited_vec,
3590 stmt_vector_for_cost *cost_vec)
3591 {
3592 int i, j;
3593 slp_tree child;
3594
3595 /* Assume we can code-generate all invariants. */
3596 if (!node
3597 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3598 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3599 return true;
3600
3601 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3602 {
3603 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_NOTE, vect_location,
3605 "Failed cyclic SLP reference in %p", node);
3606 return false;
3607 }
3608 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3609
3610 /* If we already analyzed the exact same set of scalar stmts we're done.
3611 We share the generated vector stmts for those. */
3612 if (visited_set.add (node))
3613 return true;
3614 visited_vec.safe_push (node);
3615
3616 bool res = true;
3617 unsigned visited_rec_start = visited_vec.length ();
3618 unsigned cost_vec_rec_start = cost_vec->length ();
3619 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3620 {
3621 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3622 visited_set, visited_vec,
3623 cost_vec);
3624 if (!res)
3625 break;
3626 }
3627
3628 if (res)
3629 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3630 cost_vec);
3631 /* If analysis failed we have to pop all recursive visited nodes
3632 plus ourselves. */
3633 if (!res)
3634 {
3635 while (visited_vec.length () >= visited_rec_start)
3636 visited_set.remove (visited_vec.pop ());
3637 cost_vec->truncate (cost_vec_rec_start);
3638 }
3639
3640 /* When the node can be vectorized, cost the invariant nodes it references.
3641 This is not done in DFS order to allow the referring node's
3642 vectorizable_* calls to nail down the invariant node's vector type
3643 and possibly unshare it if it needs a different vector type than
3644 other referrers. */
3645 if (res)
3646 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3647 if (child
3648 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3649 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3650 /* Perform the usual caching; note that code generation still
3651 code-gens these nodes multiple times but we expect
3652 to CSE them later. */
3653 && !visited_set.add (child))
3654 {
3655 visited_vec.safe_push (child);
3656 /* ??? After auditing more code paths make a "default"
3657 and push the vector type from NODE to all children
3658 if it is not already set. */
3659 /* Compute the number of vectors to be generated. */
3660 tree vector_type = SLP_TREE_VECTYPE (child);
3661 if (!vector_type)
3662 {
3663 /* For shifts with a scalar argument we don't need
3664 to cost or code-generate anything.
3665 ??? Represent this more explicitly. */
3666 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3667 == shift_vec_info_type)
3668 && j == 1);
3669 continue;
3670 }
3671 unsigned group_size = SLP_TREE_LANES (child);
3672 poly_uint64 vf = 1;
3673 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3674 vf = loop_vinfo->vectorization_factor;
3675 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3676 = vect_get_num_vectors (vf * group_size, vector_type);
3677 /* And cost them. */
3678 vect_prologue_cost_for_slp (child, cost_vec);
3679 }
3680
3681 /* If this node or any of its children can't be vectorized, try pruning
3682 the tree here rather than felling the whole thing. */
3683 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3684 {
3685 /* We'll need to revisit this for invariant costing and number
3686 of vectorized stmt setting. */
3687 res = true;
3688 }
3689
3690 return res;
3691 }
3692
3693
3694 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3695 region and that can be vectorized using vectorizable_live_operation
3696 with STMT_VINFO_LIVE_P. Live operations that are not handled cause
3697 the scalar code computing them to be retained. */
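/* For example (an illustrative situation, not a specific testcase): when a
   lane's scalar result is also used by a statement outside the vectorized
   region, the lane is marked STMT_VINFO_LIVE_P and a lane extract is costed
   here and emitted during code generation; when that is not possible the
   original scalar statement computing the value is simply kept.  */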
3698
3699 static void
3700 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3701 slp_instance instance,
3702 stmt_vector_for_cost *cost_vec,
3703 hash_set<stmt_vec_info> &svisited,
3704 hash_set<slp_tree> &visited)
3705 {
3706 if (visited.add (node))
3707 return;
3708
3709 unsigned i;
3710 stmt_vec_info stmt_info;
3711 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3712 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3713 {
3714 if (svisited.contains (stmt_info))
3715 continue;
3716 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3717 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3718 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3719 /* Only the pattern root stmt computes the original scalar value. */
3720 continue;
3721 bool mark_visited = true;
3722 gimple *orig_stmt = orig_stmt_info->stmt;
3723 ssa_op_iter op_iter;
3724 def_operand_p def_p;
3725 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3726 {
3727 imm_use_iterator use_iter;
3728 gimple *use_stmt;
3729 stmt_vec_info use_stmt_info;
3730 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3731 if (!is_gimple_debug (use_stmt))
3732 {
3733 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3734 if (!use_stmt_info
3735 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3736 {
3737 STMT_VINFO_LIVE_P (stmt_info) = true;
3738 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3739 NULL, node, instance, i,
3740 false, cost_vec))
3741 /* ??? So we know we can vectorize the live stmt
3742 from one SLP node. If we cannot do so consistently
3743 from all nodes (or from none) we'd have to record which
3744 SLP node (and lane) we want to use for the live
3745 operation. So make sure we can code-generate
3746 from all nodes. */
3747 mark_visited = false;
3748 else
3749 STMT_VINFO_LIVE_P (stmt_info) = false;
3750 BREAK_FROM_IMM_USE_STMT (use_iter);
3751 }
3752 }
3753 /* We have to verify whether we can insert the lane extract
3754 before all uses. The following is a conservative approximation.
3755 We cannot put this into vectorizable_live_operation because
3756 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3757 doesn't work.
3758 Note that while the fact that we emit code for loads at the
3759 first load should make this a non-problem, leaves we construct
3760 from scalars are vectorized after the last scalar def.
3761 ??? If we'd actually compute the insert location during
3762 analysis we could use something less conservative than the last
3763 scalar stmt in the node for the dominance check.
3764 /* ??? What remains is "live" uses in vector CTORs in the same
3765 SLP graph which is where those uses can end up code-generated
3766 right after their definition instead of close to their original
3767 use. But that would restrict us to code-generate lane-extracts
3768 from the latest stmt in a node. So we compensate for this
3769 during code-generation, simply not replacing uses for those
3770 hopefully rare cases. */
3771 if (STMT_VINFO_LIVE_P (stmt_info))
3772 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3773 if (!is_gimple_debug (use_stmt)
3774 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3775 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3776 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3777 {
3778 if (dump_enabled_p ())
3779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3780 "Cannot determine insertion place for "
3781 "lane extract\n");
3782 STMT_VINFO_LIVE_P (stmt_info) = false;
3783 mark_visited = true;
3784 }
3785 }
3786 if (mark_visited)
3787 svisited.add (stmt_info);
3788 }
3789
3790 slp_tree child;
3791 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3792 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3793 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3794 cost_vec, svisited, visited);
3795 }
3796
3797 /* Analyze statements in SLP instances of VINFO. Return true if the
3798 operations are supported. */
3799
3800 bool
3801 vect_slp_analyze_operations (vec_info *vinfo)
3802 {
3803 slp_instance instance;
3804 int i;
3805
3806 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3807
3808 hash_set<slp_tree> visited;
3809 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3810 {
3811 auto_vec<slp_tree> visited_vec;
3812 stmt_vector_for_cost cost_vec;
3813 cost_vec.create (2);
3814 if (is_a <bb_vec_info> (vinfo))
3815 vect_location = instance->location ();
3816 if (!vect_slp_analyze_node_operations (vinfo,
3817 SLP_INSTANCE_TREE (instance),
3818 instance, visited, visited_vec,
3819 &cost_vec)
3820 /* Instances with a root stmt require vectorized defs for the
3821 SLP tree root. */
3822 || (SLP_INSTANCE_ROOT_STMT (instance)
3823 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3824 != vect_internal_def)))
3825 {
3826 slp_tree node = SLP_INSTANCE_TREE (instance);
3827 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3828 if (dump_enabled_p ())
3829 dump_printf_loc (MSG_NOTE, vect_location,
3830 "removing SLP instance operations starting from: %G",
3831 stmt_info->stmt);
3832 vect_free_slp_instance (instance);
3833 vinfo->slp_instances.ordered_remove (i);
3834 cost_vec.release ();
3835 while (!visited_vec.is_empty ())
3836 visited.remove (visited_vec.pop ());
3837 }
3838 else
3839 {
3840 i++;
3841
3842 /* For BB vectorization remember the SLP graph entry
3843 cost for later. */
3844 if (is_a <bb_vec_info> (vinfo))
3845 instance->cost_vec = cost_vec;
3846 else
3847 {
3848 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3849 cost_vec.release ();
3850 }
3851 }
3852 }
3853
3854 /* Compute vectorizable live stmts. */
3855 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3856 {
3857 hash_set<stmt_vec_info> svisited;
3858 hash_set<slp_tree> visited;
3859 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3860 {
3861 vect_location = instance->location ();
3862 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3863 instance, &instance->cost_vec, svisited,
3864 visited);
3865 }
3866 }
3867
3868 return !vinfo->slp_instances.is_empty ();
3869 }
3870
3871 /* Get the SLP instance leader from INSTANCE_LEADER, thereby transitively
3872 closing any chain of leaders. */
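/* This is essentially the "find" operation of a union-find structure,
   with the CHAIN vector used to compress the path on the way back.  */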
3873
3874 static slp_instance
3875 get_ultimate_leader (slp_instance instance,
3876 hash_map<slp_instance, slp_instance> &instance_leader)
3877 {
3878 auto_vec<slp_instance *, 8> chain;
3879 slp_instance *tem;
3880 while (*(tem = instance_leader.get (instance)) != instance)
3881 {
3882 chain.safe_push (tem);
3883 instance = *tem;
3884 }
3885 while (!chain.is_empty ())
3886 *chain.pop () = instance;
3887 return instance;
3888 }
3889
3890 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3891
3892 static void
3893 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3894 slp_instance instance, slp_tree node,
3895 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3896 hash_map<slp_instance, slp_instance> &instance_leader,
3897 hash_set<slp_tree> &visited)
3898 {
3899 stmt_vec_info stmt_info;
3900 unsigned i;
3901
3902 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3903 {
3904 bool existed_p;
3905 slp_instance &stmt_instance
3906 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3907 if (!existed_p)
3908 ;
3909 else if (stmt_instance != instance)
3910 {
3911 /* If we're running into a previously marked stmt make the current
3912 instance the leader of that stmt's ultimate leader. This keeps the
3913 leader chain acyclic and works even when the current instance
3914 connects two previously independent graph parts. */
3915 slp_instance stmt_leader
3916 = get_ultimate_leader (stmt_instance, instance_leader);
3917 if (stmt_leader != instance)
3918 instance_leader.put (stmt_leader, instance);
3919 }
3920 stmt_instance = instance;
3921 }
3922
3923 if (visited.add (node))
3924 return;
3925
3926 slp_tree child;
3927 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3928 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3929 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3930 instance_leader, visited);
3931 }
3932
3933 /* Partition the SLP graph into pieces that can be costed independently. */
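/* For example (illustrative): two instances that end up referencing the
   same scalar stmt, say through a shared load, are forced into one subgraph
   and costed together, while instances with disjoint stmts remain separate
   subgraph entries.  */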
3934
3935 static void
3936 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3937 {
3938 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3939
3940 /* First walk the SLP graph assigning each involved scalar stmt a
3941 corresponding SLP graph entry and, upon visiting a previously
3942 marked stmt, make the stmt's leader the current SLP graph entry. */
3943 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3944 hash_map<slp_instance, slp_instance> instance_leader;
3945 hash_set<slp_tree> visited;
3946 slp_instance instance;
3947 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3948 {
3949 instance_leader.put (instance, instance);
3950 vect_bb_partition_graph_r (bb_vinfo,
3951 instance, SLP_INSTANCE_TREE (instance),
3952 stmt_to_instance, instance_leader,
3953 visited);
3954 }
3955
3956 /* Then collect entries to each independent subgraph. */
3957 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3958 {
3959 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3960 leader->subgraph_entries.safe_push (instance);
3961 if (dump_enabled_p ()
3962 && leader != instance)
3963 dump_printf_loc (MSG_NOTE, vect_location,
3964 "instance %p is leader of %p\n",
3965 leader, instance);
3966 }
3967 }
3968
3969 /* Compute the scalar cost of the SLP node NODE and its children
3970 and record it in COST_VEC. Do not account defs that are marked in LIFE;
3971 update LIFE according to uses of NODE. */
3972
3973 static void
3974 vect_bb_slp_scalar_cost (vec_info *vinfo,
3975 slp_tree node, vec<bool, va_heap> *life,
3976 stmt_vector_for_cost *cost_vec,
3977 hash_set<slp_tree> &visited)
3978 {
3979 unsigned i;
3980 stmt_vec_info stmt_info;
3981 slp_tree child;
3982
3983 if (visited.add (node))
3984 return;
3985
3986 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3987 {
3988 ssa_op_iter op_iter;
3989 def_operand_p def_p;
3990
3991 if ((*life)[i])
3992 continue;
3993
3994 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3995 gimple *orig_stmt = orig_stmt_info->stmt;
3996
3997 /* If there is a non-vectorized use of the defs then the scalar
3998 stmt is kept live, in which case we do not account it or any
3999 required defs in the SLP children in the scalar cost. This
4000 way we make the vectorization more costly when compared to
4001 the scalar cost. */
4002 if (!STMT_VINFO_LIVE_P (stmt_info))
4003 {
4004 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4005 {
4006 imm_use_iterator use_iter;
4007 gimple *use_stmt;
4008 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4009 if (!is_gimple_debug (use_stmt))
4010 {
4011 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4012 if (!use_stmt_info
4013 || !PURE_SLP_STMT
4014 (vect_stmt_to_vectorize (use_stmt_info)))
4015 {
4016 (*life)[i] = true;
4017 BREAK_FROM_IMM_USE_STMT (use_iter);
4018 }
4019 }
4020 }
4021 if ((*life)[i])
4022 continue;
4023 }
4024
4025 /* Count scalar stmts only once. */
4026 if (gimple_visited_p (orig_stmt))
4027 continue;
4028 gimple_set_visited (orig_stmt, true);
4029
4030 vect_cost_for_stmt kind;
4031 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4032 {
4033 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4034 kind = scalar_load;
4035 else
4036 kind = scalar_store;
4037 }
4038 else if (vect_nop_conversion_p (orig_stmt_info))
4039 continue;
4040 else
4041 kind = scalar_stmt;
4042 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4043 SLP_TREE_VECTYPE (node), 0, vect_body);
4044 }
4045
4046 auto_vec<bool, 20> subtree_life;
4047 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4048 {
4049 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4050 {
4051 /* Do not directly pass LIFE to the recursive call; copy it to
4052 confine changes in the callee to the current child/subtree. */
4053 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4054 {
4055 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4056 for (unsigned j = 0;
4057 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4058 {
4059 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4060 if (perm.first == i)
4061 subtree_life[perm.second] = (*life)[j];
4062 }
4063 }
4064 else
4065 {
4066 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4067 subtree_life.safe_splice (*life);
4068 }
4069 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4070 visited);
4071 subtree_life.truncate (0);
4072 }
4073 }
4074 }
4075
4076 /* Check if vectorization of the basic block is profitable for the
4077 subgraph denoted by SLP_INSTANCES. */
4078
4079 static bool
4080 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4081 vec<slp_instance> slp_instances)
4082 {
4083 slp_instance instance;
4084 int i;
4085 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4086 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4087
4088 void *vect_target_cost_data = init_cost (NULL);
4089
4090 /* Calculate scalar cost and sum the cost for the vector stmts
4091 previously collected. */
4092 stmt_vector_for_cost scalar_costs;
4093 scalar_costs.create (0);
4094 hash_set<slp_tree> visited;
4095 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4096 {
4097 auto_vec<bool, 20> life;
4098 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4099 true);
4100 vect_bb_slp_scalar_cost (bb_vinfo,
4101 SLP_INSTANCE_TREE (instance),
4102 &life, &scalar_costs, visited);
4103 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4104 instance->cost_vec.release ();
4105 }
4106 /* Unset visited flag. */
4107 stmt_info_for_cost *si;
4108 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4109 gimple_set_visited (si->stmt_info->stmt, false);
4110
4111 void *scalar_target_cost_data = init_cost (NULL);
4112 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4113 scalar_costs.release ();
4114 unsigned dummy;
4115 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4116 destroy_cost_data (scalar_target_cost_data);
4117
4118 /* Complete the target-specific vector cost calculation. */
4119 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4120 &vec_inside_cost, &vec_epilogue_cost);
4121 destroy_cost_data (vect_target_cost_data);
4122
4123 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4124
4125 if (dump_enabled_p ())
4126 {
4127 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4128 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4129 vec_inside_cost);
4130 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4131 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4132 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4133 }
4134
4135 /* Vectorization is profitable if its cost is no more than the cost of
4136 the scalar version. Note that we err on the vector side for equal cost
4137 because the cost estimate is otherwise quite pessimistic (constant uses
4138 are free on the scalar side but cost a load on the vector side, for
4139 example). */
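/* A small numeric illustration with assumed costs: for scalar_cost == 8,
   vec_inside_cost == 4 and vec_outside_cost == 4 the test 4 + 4 > 8 is
   false and the subgraph is considered profitable; with vec_inside_cost
   == 5 it would be rejected.  */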
4140 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4141 return false;
4142
4143 return true;
4144 }
4145
4146 /* qsort comparator for lane defs. */
4147
4148 static int
4149 vld_cmp (const void *a_, const void *b_)
4150 {
4151 auto *a = (const std::pair<unsigned, tree> *)a_;
4152 auto *b = (const std::pair<unsigned, tree> *)b_;
4153 return a->first - b->first;
4154 }
4155
4156 /* Return true if USE_STMT is a vector lane insert into VEC and set
4157 *THIS_LANE to the lane number that is set. */
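/* A lane insert looks like the following GIMPLE (illustrative names):

     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 32>;

   For a V4SI vector the constant bit position 32 divided by the 32-bit
   element size gives lane 1, which is what *THIS_LANE is set to.  */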
4158
4159 static bool
4160 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4161 {
4162 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4163 if (!use_ass
4164 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4165 || (vec
4166 ? gimple_assign_rhs1 (use_ass) != vec
4167 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4168 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4169 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4170 || !constant_multiple_p
4171 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4172 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4173 this_lane))
4174 return false;
4175 return true;
4176 }
4177
4178 /* Find any vectorizable constructors and add them to the grouped_stores
4179 array; record chains of BIT_INSERT_EXPR lane inserts as SLP roots. */
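/* The CONSTRUCTOR case matches GIMPLE like (illustrative names)

     v_5 = {a_1, b_2, c_3, d_4};

   where all elements are SSA names defined in the region, while the
   BIT_INSERT_EXPR case below matches a chain of single-lane inserts
   building up the same kind of vector.  */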
4180
4181 static void
4182 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4183 {
4184 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4185 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4186 !gsi_end_p (gsi); gsi_next (&gsi))
4187 {
4188 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4189 if (!assign)
4190 continue;
4191
4192 tree rhs = gimple_assign_rhs1 (assign);
4193 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4194 {
4195 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4196 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4197 CONSTRUCTOR_NELTS (rhs))
4198 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4199 || uniform_vector_p (rhs))
4200 continue;
4201
4202 unsigned j;
4203 tree val;
4204 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4205 if (TREE_CODE (val) != SSA_NAME
4206 || !bb_vinfo->lookup_def (val))
4207 break;
4208 if (j != CONSTRUCTOR_NELTS (rhs))
4209 continue;
4210
4211 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4212 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4213 }
4214 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4215 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4216 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4217 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4218 && integer_zerop (gimple_assign_rhs3 (assign))
4219 && useless_type_conversion_p
4220 (TREE_TYPE (TREE_TYPE (rhs)),
4221 TREE_TYPE (gimple_assign_rhs2 (assign)))
4222 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4223 {
4224 /* We start matching on an insert to lane zero, but since the
4225 inserts need not be ordered we have to search both
4226 the def and the use chains. */
4227 tree vectype = TREE_TYPE (rhs);
4228 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4229 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4230 auto_sbitmap lanes (nlanes);
4231 bitmap_clear (lanes);
4232 bitmap_set_bit (lanes, 0);
4233 tree def = gimple_assign_lhs (assign);
4234 lane_defs.quick_push
4235 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4236 unsigned lanes_found = 1;
4237 /* Start with the use chains; the last stmt will be the root. */
4238 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4239 do
4240 {
4241 use_operand_p use_p;
4242 gimple *use_stmt;
4243 if (!single_imm_use (def, &use_p, &use_stmt))
4244 break;
4245 unsigned this_lane;
4246 if (!bb_vinfo->lookup_stmt (use_stmt)
4247 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4248 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4249 break;
4250 if (bitmap_bit_p (lanes, this_lane))
4251 break;
4252 lanes_found++;
4253 bitmap_set_bit (lanes, this_lane);
4254 gassign *use_ass = as_a <gassign *> (use_stmt);
4255 lane_defs.quick_push (std::make_pair
4256 (this_lane, gimple_assign_rhs2 (use_ass)));
4257 last = bb_vinfo->lookup_stmt (use_ass);
4258 def = gimple_assign_lhs (use_ass);
4259 }
4260 while (lanes_found < nlanes);
4261 if (lanes_found < nlanes)
4262 {
4263 /* Now search the def chain. */
4264 def = gimple_assign_rhs1 (assign);
4265 do
4266 {
4267 if (TREE_CODE (def) != SSA_NAME
4268 || !has_single_use (def))
4269 break;
4270 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4271 unsigned this_lane;
4272 if (!bb_vinfo->lookup_stmt (def_stmt)
4273 || !vect_slp_is_lane_insert (def_stmt,
4274 NULL_TREE, &this_lane)
4275 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4276 break;
4277 if (bitmap_bit_p (lanes, this_lane))
4278 break;
4279 lanes_found++;
4280 bitmap_set_bit (lanes, this_lane);
4281 lane_defs.quick_push (std::make_pair
4282 (this_lane,
4283 gimple_assign_rhs2 (def_stmt)));
4284 def = gimple_assign_rhs1 (def_stmt);
4285 }
4286 while (lanes_found < nlanes);
4287 }
4288 if (lanes_found == nlanes)
4289 {
4290 /* Sort lane_defs by the lane index and register the root. */
4291 lane_defs.qsort (vld_cmp);
4292 vec<stmt_vec_info> stmts;
4293 stmts.create (nlanes);
4294 for (unsigned i = 0; i < nlanes; ++i)
4295 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4296 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4297 stmts, last));
4298 }
4299 }
4300 }
4301 }
4302
4303 /* Walk the grouped store chains and replace entries with their
4304 pattern variant if any. */
4305
4306 static void
4307 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4308 {
4309 stmt_vec_info first_element;
4310 unsigned i;
4311
4312 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4313 {
4314 /* We also have CTORs in this array. */
4315 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4316 continue;
4317 if (STMT_VINFO_IN_PATTERN_P (first_element))
4318 {
4319 stmt_vec_info orig = first_element;
4320 first_element = STMT_VINFO_RELATED_STMT (first_element);
4321 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4322 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4323 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4324 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4325 vinfo->grouped_stores[i] = first_element;
4326 }
4327 stmt_vec_info prev = first_element;
4328 while (DR_GROUP_NEXT_ELEMENT (prev))
4329 {
4330 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4331 if (STMT_VINFO_IN_PATTERN_P (elt))
4332 {
4333 stmt_vec_info orig = elt;
4334 elt = STMT_VINFO_RELATED_STMT (elt);
4335 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4336 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4337 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4338 }
4339 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4340 prev = elt;
4341 }
4342 }
4343 }
4344
4345 /* Check if the region described by BB_VINFO can be vectorized, returning
4346 true if so. When returning false, set FATAL to true if the same failure
4347 would prevent vectorization at other vector sizes, false if it is still
4348 worth trying other sizes. N_STMTS is the number of statements in the
4349 region. */
4350
4351 static bool
4352 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4353 vec<int> *dataref_groups)
4354 {
4355 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4356
4357 slp_instance instance;
4358 int i;
4359 poly_uint64 min_vf = 2;
4360
4361 /* The first group of checks is independent of the vector size. */
4362 fatal = true;
4363
4364 /* Analyze the data references. */
4365
4366 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4367 {
4368 if (dump_enabled_p ())
4369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4370 "not vectorized: unhandled data-ref in basic "
4371 "block.\n");
4372 return false;
4373 }
4374
4375 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4376 {
4377 if (dump_enabled_p ())
4378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4379 "not vectorized: unhandled data access in "
4380 "basic block.\n");
4381 return false;
4382 }
4383
4384 vect_slp_check_for_constructors (bb_vinfo);
4385
4386 /* If there are no grouped stores and no constructors in the region
4387 there is no need to continue with pattern recog as vect_analyze_slp
4388 will fail anyway. */
4389 if (bb_vinfo->grouped_stores.is_empty ()
4390 && bb_vinfo->roots.is_empty ())
4391 {
4392 if (dump_enabled_p ())
4393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4394 "not vectorized: no grouped stores in "
4395 "basic block.\n");
4396 return false;
4397 }
4398
4399 /* The rest of the analysis below depends on the vector size in some way. */
4400 fatal = false;
4401
4402 vect_pattern_recog (bb_vinfo);
4403
4404 /* Update store groups from pattern processing. */
4405 vect_fixup_store_groups_with_patterns (bb_vinfo);
4406
4407 /* Check the SLP opportunities in the basic block, analyze and build SLP
4408 trees. */
4409 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4410 {
4411 if (dump_enabled_p ())
4412 {
4413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4414 "Failed to SLP the basic block.\n");
4415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4416 "not vectorized: failed to find SLP opportunities "
4417 "in basic block.\n");
4418 }
4419 return false;
4420 }
4421
4422 /* Optimize permutations. */
4423 vect_optimize_slp (bb_vinfo);
4424
4425 /* Gather the loads reachable from the SLP graph entries. */
4426 vect_gather_slp_loads (bb_vinfo);
4427
4428 vect_record_base_alignments (bb_vinfo);
4429
4430 /* Analyze and verify the alignment of data references and the
4431 dependence in the SLP instances. */
4432 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4433 {
4434 vect_location = instance->location ();
4435 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4436 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4437 {
4438 slp_tree node = SLP_INSTANCE_TREE (instance);
4439 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4440 if (dump_enabled_p ())
4441 dump_printf_loc (MSG_NOTE, vect_location,
4442 "removing SLP instance operations starting from: %G",
4443 stmt_info->stmt);
4444 vect_free_slp_instance (instance);
4445 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4446 continue;
4447 }
4448
4449 /* Mark all the statements that we want to vectorize as pure SLP and
4450 relevant. */
4451 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4452 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4453 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4454 {
4455 STMT_SLP_TYPE (root) = pure_slp;
4456 if (is_gimple_assign (root->stmt)
4457 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4458 {
4459 /* ??? We should probably record the whole vector of
4460 root stmts so we do not have to back-track here... */
4461 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4462 n != 1; --n)
4463 {
4464 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4465 STMT_SLP_TYPE (root) = pure_slp;
4466 }
4467 }
4468 }
4469
4470 i++;
4471 }
4472 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4473 return false;
4474
4475 if (!vect_slp_analyze_operations (bb_vinfo))
4476 {
4477 if (dump_enabled_p ())
4478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4479 "not vectorized: bad operation in basic block.\n");
4480 return false;
4481 }
4482
4483 vect_bb_partition_graph (bb_vinfo);
4484
4485 return true;
4486 }
4487
4488 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4489 basic blocks in BBS, returning true on success.
4490 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4491
4492 static bool
4493 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4494 vec<int> *dataref_groups, unsigned int n_stmts)
4495 {
4496 bb_vec_info bb_vinfo;
4497 auto_vector_modes vector_modes;
4498
4499 /* Autodetect first vector size we try. */
4500 machine_mode next_vector_mode = VOIDmode;
4501 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4502 unsigned int mode_i = 0;
4503
4504 vec_info_shared shared;
4505
4506 machine_mode autodetected_vector_mode = VOIDmode;
4507 while (1)
4508 {
4509 bool vectorized = false;
4510 bool fatal = false;
4511 bb_vinfo = new _bb_vec_info (bbs, &shared);
4512
4513 bool first_time_p = shared.datarefs.is_empty ();
4514 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4515 if (first_time_p)
4516 bb_vinfo->shared->save_datarefs ();
4517 else
4518 bb_vinfo->shared->check_datarefs ();
4519 bb_vinfo->vector_mode = next_vector_mode;
4520
4521 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4522 && dbg_cnt (vect_slp))
4523 {
4524 if (dump_enabled_p ())
4525 {
4526 dump_printf_loc (MSG_NOTE, vect_location,
4527 "***** Analysis succeeded with vector mode"
4528 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4529 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4530 }
4531
4532 bb_vinfo->shared->check_datarefs ();
4533
4534 unsigned i;
4535 slp_instance instance;
4536 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4537 {
4538 if (instance->subgraph_entries.is_empty ())
4539 continue;
4540
4541 vect_location = instance->location ();
4542 if (!unlimited_cost_model (NULL)
4543 && !vect_bb_vectorization_profitable_p
4544 (bb_vinfo, instance->subgraph_entries))
4545 {
4546 if (dump_enabled_p ())
4547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4548 "not vectorized: vectorization is not "
4549 "profitable.\n");
4550 continue;
4551 }
4552
4553 if (!vectorized && dump_enabled_p ())
4554 dump_printf_loc (MSG_NOTE, vect_location,
4555 "Basic block will be vectorized "
4556 "using SLP\n");
4557 vectorized = true;
4558
4559 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4560
4561 unsigned HOST_WIDE_INT bytes;
4562 if (dump_enabled_p ())
4563 {
4564 if (GET_MODE_SIZE
4565 (bb_vinfo->vector_mode).is_constant (&bytes))
4566 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4567 "basic block part vectorized using %wu "
4568 "byte vectors\n", bytes);
4569 else
4570 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4571 "basic block part vectorized using "
4572 "variable length vectors\n");
4573 }
4574 }
4575 }
4576 else
4577 {
4578 if (dump_enabled_p ())
4579 dump_printf_loc (MSG_NOTE, vect_location,
4580 "***** Analysis failed with vector mode %s\n",
4581 GET_MODE_NAME (bb_vinfo->vector_mode));
4582 }
4583
4584 if (mode_i == 0)
4585 autodetected_vector_mode = bb_vinfo->vector_mode;
4586
4587 if (!fatal)
4588 while (mode_i < vector_modes.length ()
4589 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4590 {
4591 if (dump_enabled_p ())
4592 dump_printf_loc (MSG_NOTE, vect_location,
4593 "***** The result for vector mode %s would"
4594 " be the same\n",
4595 GET_MODE_NAME (vector_modes[mode_i]));
4596 mode_i += 1;
4597 }
4598
4599 delete bb_vinfo;
4600
4601 if (mode_i < vector_modes.length ()
4602 && VECTOR_MODE_P (autodetected_vector_mode)
4603 && (related_vector_mode (vector_modes[mode_i],
4604 GET_MODE_INNER (autodetected_vector_mode))
4605 == autodetected_vector_mode)
4606 && (related_vector_mode (autodetected_vector_mode,
4607 GET_MODE_INNER (vector_modes[mode_i]))
4608 == vector_modes[mode_i]))
4609 {
4610 if (dump_enabled_p ())
4611 dump_printf_loc (MSG_NOTE, vect_location,
4612 "***** Skipping vector mode %s, which would"
4613 " repeat the analysis for %s\n",
4614 GET_MODE_NAME (vector_modes[mode_i]),
4615 GET_MODE_NAME (autodetected_vector_mode));
4616 mode_i += 1;
4617 }
4618
4619 if (vectorized
4620 || mode_i == vector_modes.length ()
4621 || autodetected_vector_mode == VOIDmode
4622 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4623 vector sizes will fail do not bother iterating. */
4624 || fatal)
4625 return vectorized;
4626
4627 /* Try the next biggest vector size. */
4628 next_vector_mode = vector_modes[mode_i++];
4629 if (dump_enabled_p ())
4630 dump_printf_loc (MSG_NOTE, vect_location,
4631 "***** Re-trying analysis with vector mode %s\n",
4632 GET_MODE_NAME (next_vector_mode));
4633 }
4634 }
4635
4636
4637 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4638 true if anything in the basic-block was vectorized. */
4639
4640 static bool
4641 vect_slp_bbs (vec<basic_block> bbs)
4642 {
4643 vec<data_reference_p> datarefs = vNULL;
4644 auto_vec<int> dataref_groups;
4645 int insns = 0;
4646 int current_group = 0;
4647
4648 for (unsigned i = 0; i < bbs.length (); i++)
4649 {
4650 basic_block bb = bbs[i];
4651 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4652 gsi_next (&gsi))
4653 {
4654 gimple *stmt = gsi_stmt (gsi);
4655 if (is_gimple_debug (stmt))
4656 continue;
4657
4658 insns++;
4659
4660 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4661 vect_location = stmt;
4662
4663 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4664 &dataref_groups, current_group))
4665 ++current_group;
4666 }
4667 }
4668
4669 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4670 }
4671
4672 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4673 true if anything in the basic-block was vectorized. */
4674
4675 bool
4676 vect_slp_bb (basic_block bb)
4677 {
4678 auto_vec<basic_block> bbs;
4679 bbs.safe_push (bb);
4680 return vect_slp_bbs (bbs);
4681 }
4682
4683 /* Main entry for the BB vectorizer. Analyze and transform the blocks
4684 of FUN, returning true if anything in the function was vectorized. */
4685
4686 bool
4687 vect_slp_function (function *fun)
4688 {
4689 bool r = false;
4690 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4691 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4692
4693 /* For the moment split the function into pieces to avoid making
4694 the iteration on the vector mode moot. Split at points we know
4695 we do not handle well, namely CFG merges (SLP discovery doesn't
4696 handle non-loop-header PHIs) and loop exits. Since pattern
4697 recognition requires reverse iteration to visit uses before defs,
4698 simply chop the RPO into pieces. */
4699 auto_vec<basic_block> bbs;
4700 for (unsigned i = 0; i < n; i++)
4701 {
4702 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4703 bool split = false;
4704
4705 /* Split when a BB is not dominated by the first block. */
4706 if (!bbs.is_empty ()
4707 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4708 {
4709 if (dump_enabled_p ())
4710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4711 "splitting region at dominance boundary bb%d\n",
4712 bb->index);
4713 split = true;
4714 }
4715 /* Split when the loop determined by the first block
4716 is exited. This is because we eventually insert
4717 invariants at region begin. */
4718 else if (!bbs.is_empty ()
4719 && bbs[0]->loop_father != bb->loop_father
4720 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4721 {
4722 if (dump_enabled_p ())
4723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4724 "splitting region at loop %d exit at bb%d\n",
4725 bbs[0]->loop_father->num, bb->index);
4726 split = true;
4727 }
4728
4729 if (split && !bbs.is_empty ())
4730 {
4731 r |= vect_slp_bbs (bbs);
4732 bbs.truncate (0);
4733 bbs.quick_push (bb);
4734 }
4735 else
4736 bbs.safe_push (bb);
4737
4738 /* When we have a stmt ending this block and defining a
4739 value, inserting after it for a vector containing its definition
4740 would require inserting on edges. Avoid this for now. */
4741 if (gimple *last = last_stmt (bb))
4742 if (gimple_get_lhs (last)
4743 && is_ctrl_altering_stmt (last))
4744 {
4745 if (dump_enabled_p ())
4746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4747 "splitting region at control altering "
4748 "definition %G", last);
4749 r |= vect_slp_bbs (bbs);
4750 bbs.truncate (0);
4751 }
4752 }
4753
4754 if (!bbs.is_empty ())
4755 r |= vect_slp_bbs (bbs);
4756
4757 free (rpo);
4758
4759 return r;
4760 }
4761
4762 /* Build a variable-length vector in which the elements in ELTS are repeated
4763 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4764 RESULTS and add any new instructions to SEQ.
4765
4766 The approach we use is:
4767
4768 (1) Find a vector mode VM with integer elements of mode IM.
4769
4770 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4771 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4772 from small vectors to IM.
4773
4774 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4775
4776 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4777 correct byte contents.
4778
4779 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4780
4781 We try to find the largest IM for which this sequence works, in order
4782 to cut down on the number of interleaves. */
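/* As a small illustration (assumed inputs): for ELTS == {a, b} the routine
   fills the result vectors with the repeating pattern {a, b, a, b, ...},
   e.g. by duplicating a and b into two integer vectors and combining them
   with interleaving VEC_PERM_EXPRs.  */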
4783
4784 void
4785 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4786 vec<tree> elts, unsigned int nresults,
4787 vec<tree> &results)
4788 {
4789 unsigned int nelts = elts.length ();
4790 tree element_type = TREE_TYPE (vector_type);
4791
4792 /* (1) Find a vector mode VM with integer elements of mode IM. */
4793 unsigned int nvectors = 1;
4794 tree new_vector_type;
4795 tree permutes[2];
4796 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4797 &nvectors, &new_vector_type,
4798 permutes))
4799 gcc_unreachable ();
4800
4801 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4802 unsigned int partial_nelts = nelts / nvectors;
4803 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4804
4805 tree_vector_builder partial_elts;
4806 auto_vec<tree, 32> pieces (nvectors * 2);
4807 pieces.quick_grow (nvectors * 2);
4808 for (unsigned int i = 0; i < nvectors; ++i)
4809 {
4810 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4811 ELTS' has mode IM. */
4812 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4813 for (unsigned int j = 0; j < partial_nelts; ++j)
4814 partial_elts.quick_push (elts[i * partial_nelts + j]);
4815 tree t = gimple_build_vector (seq, &partial_elts);
4816 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4817 TREE_TYPE (new_vector_type), t);
4818
4819 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4820 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4821 }
4822
4823 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4824 correct byte contents.
4825
4826 We need to repeat the following operation log2(nvectors) times:
4827
4828 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4829 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4830
4831 However, if each input repeats every N elements and the VF is
4832 a multiple of N * 2, the HI result is the same as the LO. */
4833 unsigned int in_start = 0;
4834 unsigned int out_start = nvectors;
4835 unsigned int hi_start = nvectors / 2;
4836 /* A bound on the number of outputs needed to produce NRESULTS results
4837 in the final iteration. */
4838 unsigned int noutputs_bound = nvectors * nresults;
4839 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4840 {
4841 noutputs_bound /= 2;
4842 unsigned int limit = MIN (noutputs_bound, nvectors);
4843 for (unsigned int i = 0; i < limit; ++i)
4844 {
4845 if ((i & 1) != 0
4846 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4847 2 * in_repeat))
4848 {
4849 pieces[out_start + i] = pieces[out_start + i - 1];
4850 continue;
4851 }
4852
4853 tree output = make_ssa_name (new_vector_type);
4854 tree input1 = pieces[in_start + (i / 2)];
4855 tree input2 = pieces[in_start + (i / 2) + hi_start];
4856 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4857 input1, input2,
4858 permutes[i & 1]);
4859 gimple_seq_add_stmt (seq, stmt);
4860 pieces[out_start + i] = output;
4861 }
4862 std::swap (in_start, out_start);
4863 }
4864
4865 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4866 results.reserve (nresults);
4867 for (unsigned int i = 0; i < nresults; ++i)
4868 if (i < nvectors)
4869 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4870 pieces[in_start + i]));
4871 else
4872 results.quick_push (results[i - nvectors]);
4873 }
4874
4875
4876 /* For constant and loop invariant defs in OP_NODE this function creates
4877 vector defs that will be used in the vectorized stmts and stores them
4878 to SLP_TREE_VEC_DEFS of OP_NODE. */
4879
4880 static void
4881 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4882 {
4883 unsigned HOST_WIDE_INT nunits;
4884 tree vec_cst;
4885 unsigned j, number_of_places_left_in_vector;
4886 tree vector_type;
4887 tree vop;
4888 int group_size = op_node->ops.length ();
4889 unsigned int vec_num, i;
4890 unsigned number_of_copies = 1;
4891 bool constant_p;
4892 gimple_seq ctor_seq = NULL;
4893 auto_vec<tree, 16> permute_results;
4894
4895 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4896 vector_type = SLP_TREE_VECTYPE (op_node);
4897
4898 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4899 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4900 auto_vec<tree> voprnds (number_of_vectors);
4901
4902 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4903 created vectors. It is greater than 1 if unrolling is performed.
4904
4905 For example, we have two scalar operands, s1 and s2 (e.g., group of
4906 strided accesses of size two), while NUNITS is four (i.e., four scalars
4907 of this type can be packed in a vector). The output vector will contain
4908 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4909 will be 2).
4910
4911 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4912 containing the operands.
4913
4914 For example, NUNITS is four as before, and the group size is 8
4915 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4916 {s5, s6, s7, s8}. */
4917
4918 /* When using duplicate_and_interleave, we just need one element for
4919 each scalar statement. */
4920 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4921 nunits = group_size;
4922
4923 number_of_copies = nunits * number_of_vectors / group_size;
4924
4925 number_of_places_left_in_vector = nunits;
4926 constant_p = true;
4927 tree_vector_builder elts (vector_type, nunits, 1);
4928 elts.quick_grow (nunits);
4929 stmt_vec_info insert_after = NULL;
4930 for (j = 0; j < number_of_copies; j++)
4931 {
4932 tree op;
4933 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4934 {
4935 /* Create 'vect_ = {op0,op1,...,opn}'. */
4936 number_of_places_left_in_vector--;
4937 tree orig_op = op;
4938 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4939 {
4940 if (CONSTANT_CLASS_P (op))
4941 {
4942 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4943 {
4944 /* Can't use VIEW_CONVERT_EXPR for booleans because
4945 of possibly different sizes of scalar value and
4946 vector element. */
4947 if (integer_zerop (op))
4948 op = build_int_cst (TREE_TYPE (vector_type), 0);
4949 else if (integer_onep (op))
4950 op = build_all_ones_cst (TREE_TYPE (vector_type));
4951 else
4952 gcc_unreachable ();
4953 }
4954 else
4955 op = fold_unary (VIEW_CONVERT_EXPR,
4956 TREE_TYPE (vector_type), op);
4957 gcc_assert (op && CONSTANT_CLASS_P (op));
4958 }
4959 else
4960 {
4961 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4962 gimple *init_stmt;
4963 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4964 {
4965 tree true_val
4966 = build_all_ones_cst (TREE_TYPE (vector_type));
4967 tree false_val
4968 = build_zero_cst (TREE_TYPE (vector_type));
4969 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4970 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4971 op, true_val,
4972 false_val);
4973 }
4974 else
4975 {
4976 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4977 op);
4978 init_stmt
4979 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4980 op);
4981 }
4982 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4983 op = new_temp;
4984 }
4985 }
4986 elts[number_of_places_left_in_vector] = op;
4987 if (!CONSTANT_CLASS_P (op))
4988 constant_p = false;
4989 /* For BB vectorization we have to compute an insert location
4990 when a def is inside the analyzed region since we cannot
4991 simply insert at the BB start in this case. */
4992 stmt_vec_info opdef;
4993 if (TREE_CODE (orig_op) == SSA_NAME
4994 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4995 && is_a <bb_vec_info> (vinfo)
4996 && (opdef = vinfo->lookup_def (orig_op)))
4997 {
4998 if (!insert_after)
4999 insert_after = opdef;
5000 else
5001 insert_after = get_later_stmt (insert_after, opdef);
5002 }
5003
5004 if (number_of_places_left_in_vector == 0)
5005 {
5006 if (constant_p
5007 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5008 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5009 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5010 else
5011 {
5012 if (permute_results.is_empty ())
5013 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5014 elts, number_of_vectors,
5015 permute_results);
5016 vec_cst = permute_results[number_of_vectors - j - 1];
5017 }
5018 if (!gimple_seq_empty_p (ctor_seq))
5019 {
5020 if (insert_after)
5021 {
5022 gimple_stmt_iterator gsi;
5023 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5024 {
5025 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5026 gsi_insert_seq_before (&gsi, ctor_seq,
5027 GSI_CONTINUE_LINKING);
5028 }
5029 else if (!stmt_ends_bb_p (insert_after->stmt))
5030 {
5031 gsi = gsi_for_stmt (insert_after->stmt);
5032 gsi_insert_seq_after (&gsi, ctor_seq,
5033 GSI_CONTINUE_LINKING);
5034 }
5035 else
5036 {
5037 /* When we want to insert after a def whose defining
5038 stmt throws, insert on the fallthru
5039 edge instead. */
5040 edge e = find_fallthru_edge
5041 (gimple_bb (insert_after->stmt)->succs);
5042 basic_block new_bb
5043 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5044 gcc_assert (!new_bb);
5045 }
5046 }
5047 else
5048 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5049 ctor_seq = NULL;
5050 }
5051 voprnds.quick_push (vec_cst);
5052 insert_after = NULL;
5053 number_of_places_left_in_vector = nunits;
5054 constant_p = true;
5055 elts.new_vector (vector_type, nunits, 1);
5056 elts.quick_grow (nunits);
5057 }
5058 }
5059 }
5060
5061 /* Since the vectors are created in the reverse order, we should invert
5062 them. */
5063 vec_num = voprnds.length ();
5064 for (j = vec_num; j != 0; j--)
5065 {
5066 vop = voprnds[j - 1];
5067 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5068 }
5069
5070 /* If the VF is greater than the unrolling factor needed for the SLP
5071 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5072 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5073 to replicate the vectors. */
5074 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5075 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5076 i++)
5077 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5078 }
5079
5080 /* Get the Ith vectorized definition from SLP_NODE. */
5081
5082 tree
5083 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5084 {
5085 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5086 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5087 else
5088 return SLP_TREE_VEC_DEFS (slp_node)[i];
5089 }
5090
5091 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5092
5093 void
5094 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5095 {
5096 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5097 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5098 {
5099 unsigned j;
5100 gimple *vec_def_stmt;
5101 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5102 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5103 }
5104 else
5105 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5106 }
5107
5108 /* Get N vectorized definitions for SLP_NODE. */
5109
5110 void
5111 vect_get_slp_defs (vec_info *,
5112 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5113 {
5114 if (n == -1U)
5115 n = SLP_TREE_CHILDREN (slp_node).length ();
5116
5117 for (unsigned i = 0; i < n; ++i)
5118 {
5119 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5120 vec<tree> vec_defs = vNULL;
5121 vect_get_slp_defs (child, &vec_defs);
5122 vec_oprnds->quick_push (vec_defs);
5123 }
5124 }
5125
5126 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5127 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5128 permute statements for the SLP node NODE. Store the number of vector
5129 permute instructions in *N_PERMS and the number of vector load
5130 instructions in *N_LOADS. */
5131
5132 bool
5133 vect_transform_slp_perm_load (vec_info *vinfo,
5134 slp_tree node, vec<tree> dr_chain,
5135 gimple_stmt_iterator *gsi, poly_uint64 vf,
5136 bool analyze_only, unsigned *n_perms,
5137 unsigned int *n_loads)
5138 {
5139 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5140 int vec_index = 0;
5141 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5142 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5143 unsigned int mask_element;
5144 machine_mode mode;
5145
5146 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5147 return false;
5148
5149 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5150
5151 mode = TYPE_MODE (vectype);
5152 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5153
5154 /* Initialize the vect stmts of NODE to properly insert the generated
5155 stmts later. */
5156 if (! analyze_only)
5157 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5158 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5159 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5160
5161 /* Generate permutation masks for every NODE. The number of masks for each
5162 NODE is equal to GROUP_SIZE.
5163 E.g., we have a group of three nodes with three loads from the same
5164 location in each node, and the vector size is 4. I.e., we have an
5165 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5166 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5167 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5168 ...
5169
5170 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5171 The last mask is illegal since we assume two operands for permute
5172 operation, and the mask element values can't be outside that range.
5173 Hence, the last mask must be converted into {2,5,5,5}.
5174 For the first two permutations we need the first and the second input
5175 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5176 we need the second and the third vectors: {b1,c1,a2,b2} and
5177 {c2,a3,b3,c3}. */
5178
5179 int vect_stmts_counter = 0;
5180 unsigned int index = 0;
5181 int first_vec_index = -1;
5182 int second_vec_index = -1;
5183 bool noop_p = true;
5184 *n_perms = 0;
5185
5186 vec_perm_builder mask;
5187 unsigned int nelts_to_build;
5188 unsigned int nvectors_per_build;
5189 unsigned int in_nlanes;
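 /* The permute pattern repeats verbatim when the SLP group covers the
    whole DR group and the number of vector lanes is a multiple of the
    group size.  */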
5190 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5191 && multiple_p (nunits, group_size));
5192 if (repeating_p)
5193 {
5194 /* A single vector contains a whole number of copies of the node, so:
5195 (a) all permutes can use the same mask; and
5196 (b) the permutes only need a single vector input. */
5197 mask.new_vector (nunits, group_size, 3);
5198 nelts_to_build = mask.encoded_nelts ();
5199 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5200 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5201 }
5202 else
5203 {
5204 /* We need to construct a separate mask for each vector statement. */
5205 unsigned HOST_WIDE_INT const_nunits, const_vf;
5206 if (!nunits.is_constant (&const_nunits)
5207 || !vf.is_constant (&const_vf))
5208 return false;
5209 mask.new_vector (const_nunits, const_nunits, 1);
5210 nelts_to_build = const_vf * group_size;
5211 nvectors_per_build = 1;
5212 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5213 }
5214 auto_sbitmap used_in_lanes (in_nlanes);
5215 bitmap_clear (used_in_lanes);
5216
5217 unsigned int count = mask.encoded_nelts ();
5218 mask.quick_grow (count);
5219 vec_perm_indices indices;
5220
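 /* For each output lane to build (all of them, or just the encoded
    pattern when repeating_p), look up the input lane via the load
    permutation and record which input vector and element supply it.  */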
5221 for (unsigned int j = 0; j < nelts_to_build; j++)
5222 {
5223 unsigned int iter_num = j / group_size;
5224 unsigned int stmt_num = j % group_size;
5225 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5226 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5227 bitmap_set_bit (used_in_lanes, i);
5228 if (repeating_p)
5229 {
5230 first_vec_index = 0;
5231 mask_element = i;
5232 }
5233 else
5234 {
5235 /* Enforced before the loop when !repeating_p. */
5236 unsigned int const_nunits = nunits.to_constant ();
5237 vec_index = i / const_nunits;
5238 mask_element = i % const_nunits;
5239 if (vec_index == first_vec_index
5240 || first_vec_index == -1)
5241 {
5242 first_vec_index = vec_index;
5243 }
5244 else if (vec_index == second_vec_index
5245 || second_vec_index == -1)
5246 {
5247 second_vec_index = vec_index;
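 /* Lanes of the second permute input are referenced with indices
    offset by the number of vector elements.  */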
5248 mask_element += const_nunits;
5249 }
5250 else
5251 {
5252 if (dump_enabled_p ())
5253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5254 "permutation requires at "
5255 "least three vectors %G",
5256 stmt_info->stmt);
5257 gcc_assert (analyze_only);
5258 return false;
5259 }
5260
5261 gcc_assert (mask_element < 2 * const_nunits);
5262 }
5263
5264 if (mask_element != index)
5265 noop_p = false;
5266 mask[index++] = mask_element;
5267
5268 if (index == count && !noop_p)
5269 {
5270 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5271 if (!can_vec_perm_const_p (mode, indices))
5272 {
5273 if (dump_enabled_p ())
5274 {
5275 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5276 vect_location,
5277 "unsupported vect permute { ");
5278 for (i = 0; i < count; ++i)
5279 {
5280 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5281 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5282 }
5283 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5284 }
5285 gcc_assert (analyze_only);
5286 return false;
5287 }
5288
5289 ++*n_perms;
5290 }
5291
5292 if (index == count)
5293 {
5294 if (!analyze_only)
5295 {
5296 tree mask_vec = NULL_TREE;
5297
5298 if (! noop_p)
5299 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5300
5301 if (second_vec_index == -1)
5302 second_vec_index = first_vec_index;
5303
5304 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5305 {
5306 /* Generate the permute statement if necessary. */
5307 tree first_vec = dr_chain[first_vec_index + ri];
5308 tree second_vec = dr_chain[second_vec_index + ri];
5309 gimple *perm_stmt;
5310 if (! noop_p)
5311 {
5312 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5313 tree perm_dest
5314 = vect_create_destination_var (gimple_assign_lhs (stmt),
5315 vectype);
5316 perm_dest = make_ssa_name (perm_dest);
5317 perm_stmt
5318 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5319 first_vec, second_vec,
5320 mask_vec);
5321 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5322 gsi);
5323 }
5324 else
5325 /* If mask was NULL_TREE generate the requested
5326 identity transform. */
5327 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5328
5329 /* Store the vector statement in NODE. */
5330 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5331 }
5332 }
5333
5334 index = 0;
5335 first_vec_index = -1;
5336 second_vec_index = -1;
5337 noop_p = true;
5338 }
5339 }
5340
5341 if (n_loads)
5342 {
5343 if (repeating_p)
5344 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5345 else
5346 {
5347 /* Enforced above when !repeating_p. */
5348 unsigned int const_nunits = nunits.to_constant ();
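 /* Count how many input vectors have at least one of their lanes used,
    scanning used_in_lanes in chunks of const_nunits.  */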
5349 *n_loads = 0;
5350 bool load_seen = false;
5351 for (unsigned i = 0; i < in_nlanes; ++i)
5352 {
5353 if (i % const_nunits == 0)
5354 {
5355 if (load_seen)
5356 *n_loads += 1;
5357 load_seen = false;
5358 }
5359 if (bitmap_bit_p (used_in_lanes, i))
5360 load_seen = true;
5361 }
5362 if (load_seen)
5363 *n_loads += 1;
5364 }
5365 }
5366
5367 return true;
5368 }
5369
5370
5371 /* Vectorize the SLP permutations in NODE as specified
5372 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5373 child number and lane number.
5374 Interleaving of two two-lane two-child SLP subtrees (not supported):
5375 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5376 A blend of two four-lane two-child SLP subtrees:
5377 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5378 Highpart of a four-lane one-child SLP subtree (not supported):
5379 [ { 0, 2 }, { 0, 3 } ]
5380 Currently only a subset of these is supported by the code generation below. */
5381
5382 static bool
5383 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5384 slp_tree node, stmt_vector_for_cost *cost_vec)
5385 {
5386 tree vectype = SLP_TREE_VECTYPE (node);
5387
5388 /* ??? We currently require all vector input and output types to be the
5389 same, while the SLP IL should really do a concat + select and thus accept
5390 arbitrary mismatches. */
5391 slp_tree child;
5392 unsigned i;
5393 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5394 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5395 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5396 {
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 "Unsupported lane permutation\n");
5400 return false;
5401 }
5402
5403 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5404 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5405 if (dump_enabled_p ())
5406 {
5407 dump_printf_loc (MSG_NOTE, vect_location,
5408 "vectorizing permutation");
5409 for (unsigned i = 0; i < perm.length (); ++i)
5410 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5411 dump_printf (MSG_NOTE, "\n");
5412 }
5413
5414 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5415 if (!nunits.is_constant ())
5416 return false;
5417 unsigned HOST_WIDE_INT vf = 1;
5418 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5419 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5420 return false;
5421 unsigned olanes = vf * SLP_TREE_LANES (node);
5422 gcc_assert (multiple_p (olanes, nunits));
5423
5424 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5425 from the { SLP operand, scalar lane } permutation as recorded in the
5426 SLP node as intermediate step. This part should already work
5427 with SLP children with arbitrary number of lanes. */
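 /* Illustrative example: a blend [ { 0, 0 }, { 1, 1 } ] of two two-lane
    children with two-element vectors and vf == 1 yields the sequence
    { { 0, 0 }, 0 }, { { 1, 0 }, 1 }, i.e. lane 0 of vector 0 of operand 0
    followed by lane 1 of vector 0 of operand 1.  */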
5428 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5429 auto_vec<unsigned> active_lane;
5430 vperm.create (olanes);
5431 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5432 for (unsigned i = 0; i < vf; ++i)
5433 {
5434 for (unsigned pi = 0; pi < perm.length (); ++pi)
5435 {
5436 std::pair<unsigned, unsigned> p = perm[pi];
5437 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5438 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5439 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5440 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5441 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5442 }
5443 /* Advance to the next group. */
5444 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5445 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5446 }
5447
5448 if (dump_enabled_p ())
5449 {
5450 dump_printf_loc (MSG_NOTE, vect_location, "as");
5451 for (unsigned i = 0; i < vperm.length (); ++i)
5452 {
5453 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5454 dump_printf (MSG_NOTE, ",");
5455 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5456 vperm[i].first.first, vperm[i].first.second,
5457 vperm[i].second);
5458 }
5459 dump_printf (MSG_NOTE, "\n");
5460 }
5461
5462 /* We can only handle two-vector permutes; everything else should
5463 be lowered on the SLP level. The following is closely inspired
5464 by vect_transform_slp_perm_load and is supposed to eventually
5465 replace it.
5466 ??? As intermediate step do code-gen in the SLP tree representation
5467 somehow? */
5468 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5469 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5470 unsigned int const_nunits = nunits.to_constant ();
5471 unsigned int index = 0;
5472 unsigned int mask_element;
5473 vec_perm_builder mask;
5474 mask.new_vector (const_nunits, const_nunits, 1);
5475 unsigned int count = mask.encoded_nelts ();
5476 mask.quick_grow (count);
5477 vec_perm_indices indices;
5478 unsigned nperms = 0;
5479 for (unsigned i = 0; i < vperm.length (); ++i)
5480 {
5481 mask_element = vperm[i].second;
5482 if (first_vec.first == -1U
5483 || first_vec == vperm[i].first)
5484 first_vec = vperm[i].first;
5485 else if (second_vec.first == -1U
5486 || second_vec == vperm[i].first)
5487 {
5488 second_vec = vperm[i].first;
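 /* The second input's lanes are addressed with indices of at least
    const_nunits.  */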
5489 mask_element += const_nunits;
5490 }
5491 else
5492 {
5493 if (dump_enabled_p ())
5494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5495 "permutation requires at "
5496 "least three vectors\n");
5497 gcc_assert (!gsi);
5498 return false;
5499 }
5500
5501 mask[index++] = mask_element;
5502
5503 if (index == count)
5504 {
5505 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5506 const_nunits);
5507 bool identity_p = indices.series_p (0, 1, 0, 1);
5508 if (!identity_p
5509 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5510 {
5511 if (dump_enabled_p ())
5512 {
5513 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5514 vect_location,
5515 "unsupported vect permute { ");
5516 for (i = 0; i < count; ++i)
5517 {
5518 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5519 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5520 }
5521 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5522 }
5523 gcc_assert (!gsi);
5524 return false;
5525 }
5526
5527 if (!identity_p)
5528 nperms++;
5529 if (gsi)
5530 {
5531 if (second_vec.first == -1U)
5532 second_vec = first_vec;
5533
5534 /* Generate the permute statement if necessary. */
5535 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5536 tree first_def
5537 = vect_get_slp_vect_def (first_node, first_vec.second);
5538 gassign *perm_stmt;
5539 tree perm_dest = make_ssa_name (vectype);
5540 if (!identity_p)
5541 {
5542 slp_tree second_node
5543 = SLP_TREE_CHILDREN (node)[second_vec.first];
5544 tree second_def
5545 = vect_get_slp_vect_def (second_node, second_vec.second);
5546 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5547 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5548 first_def, second_def,
5549 mask_vec);
5550 }
5551 else
5552 /* We need a copy here in case the def was external. */
5553 perm_stmt = gimple_build_assign (perm_dest, first_def);
5554 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5555 /* Store the vector statement in NODE. */
5556 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5557 }
5558
5559 index = 0;
5560 first_vec = std::make_pair (-1U, -1U);
5561 second_vec = std::make_pair (-1U, -1U);
5562 }
5563 }
5564
5565 if (!gsi)
5566 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5567
5568 return true;
5569 }
5570
5571 /* Vectorize SLP NODE. */
5572
5573 static void
5574 vect_schedule_slp_node (vec_info *vinfo,
5575 slp_tree node, slp_instance instance)
5576 {
5577 gimple_stmt_iterator si;
5578 int i;
5579 slp_tree child;
5580
5581 /* For existing vectors there's nothing to do. */
5582 if (SLP_TREE_VEC_DEFS (node).exists ())
5583 return;
5584
5585 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5586
5587 /* Vectorize externals and constants. */
5588 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5589 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5590 {
5591 /* ??? vectorizable_shift can end up using a scalar operand which is
5592 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5593 node in this case. */
5594 if (!SLP_TREE_VECTYPE (node))
5595 return;
5596
5597 vect_create_constant_vectors (vinfo, node);
5598 return;
5599 }
5600
5601 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5602
5603 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5604 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5605
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_NOTE, vect_location,
5608 "------>vectorizing SLP node starting from: %G",
5609 stmt_info->stmt);
5610
5611 if (STMT_VINFO_DATA_REF (stmt_info)
5612 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5613 {
5614 /* Vectorized loads go before the first scalar load to make it
5615 ready early; vectorized stores go before the last scalar
5616 stmt, which is where all uses are ready. */
5617 stmt_vec_info last_stmt_info = NULL;
5618 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5619 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5620 else /* DR_IS_WRITE */
5621 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5622 si = gsi_for_stmt (last_stmt_info->stmt);
5623 }
5624 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5625 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5626 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5627 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5628 {
5629 /* For PHI node vectorization we do not use the insertion iterator. */
5630 si = gsi_none ();
5631 }
5632 else
5633 {
5634 /* Emit other stmts after the children's vectorized defs, which is
5635 the earliest possible place. */
5636 gimple *last_stmt = NULL;
5637 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5638 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5639 {
5640 /* For fold-left reductions we retain the scalar
5641 reduction PHI but still have SLP_TREE_NUMBER_OF_VEC_STMTS
5642 set, so the representation isn't perfect. Resort to the
5643 last scalar def here. */
5644 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5645 {
5646 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5647 == cycle_phi_info_type);
5648 gphi *phi = as_a <gphi *>
5649 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5650 if (!last_stmt
5651 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5652 last_stmt = phi;
5653 }
5654 /* All vectorized stmts of a child are emitted at the same place,
5655 so its last vector stmt is also the latest one.
5656 ??? Unless we have a load permutation applied and that
5657 figures to re-use an earlier generated load. */
5658 unsigned j;
5659 gimple *vstmt;
5660 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5661 if (!last_stmt
5662 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5663 last_stmt = vstmt;
5664 }
5665 else if (!SLP_TREE_VECTYPE (child))
5666 {
5667 /* Externals without a vector type use their scalar defs unvectorized,
 so look at all of them. */
5668 unsigned j;
5669 tree def;
5670 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5671 if (TREE_CODE (def) == SSA_NAME
5672 && !SSA_NAME_IS_DEFAULT_DEF (def))
5673 {
5674 gimple *stmt = SSA_NAME_DEF_STMT (def);
5675 if (!last_stmt
5676 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5677 last_stmt = stmt;
5678 }
5679 }
5680 else
5681 {
5682 /* For externals we have to look at all defs since their
5683 insertion place is decided per vector. But beware
5684 of pre-existing vectors where we need to make sure
5685 we do not insert before the region boundary. */
5686 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5687 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5688 last_stmt = gsi_stmt (gsi_after_labels
5689 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5690 else
5691 {
5692 unsigned j;
5693 tree vdef;
5694 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5695 if (TREE_CODE (vdef) == SSA_NAME
5696 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5697 {
5698 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5699 if (!last_stmt
5700 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5701 last_stmt = vstmt;
5702 }
5703 }
5704 }
5705 /* This can happen when all children are pre-existing vectors or
5706 constants. */
5707 if (!last_stmt)
5708 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5709 if (is_a <gphi *> (last_stmt))
5710 si = gsi_after_labels (gimple_bb (last_stmt));
5711 else
5712 {
5713 si = gsi_for_stmt (last_stmt);
5714 gsi_next (&si);
5715 }
5716 }
5717
5718 bool done_p = false;
5719
5720 /* Handle purely internal nodes. */
5721 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5722 {
5723 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5724 be shared with different SLP nodes (but usually it's the same
5725 operation, apart from the case where the stmt is only there to denote
5726 the actual scalar lane defs ...). So do not call vect_transform_stmt
5727 but open-code it here (partly). */
5728 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5729 gcc_assert (done);
5730 done_p = true;
5731 }
5732 if (!done_p)
5733 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5734 }
5735
5736 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5737 For loop vectorization this is done in vectorizable_call, but for SLP
5738 it needs to be deferred until end of vect_schedule_slp, because multiple
5739 SLP instances may refer to the same scalar stmt. */
5740
5741 static void
5742 vect_remove_slp_scalar_calls (vec_info *vinfo,
5743 slp_tree node, hash_set<slp_tree> &visited)
5744 {
5745 gimple *new_stmt;
5746 gimple_stmt_iterator gsi;
5747 int i;
5748 slp_tree child;
5749 tree lhs;
5750 stmt_vec_info stmt_info;
5751
5752 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5753 return;
5754
5755 if (visited.add (node))
5756 return;
5757
5758 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5759 vect_remove_slp_scalar_calls (vinfo, child, visited);
5760
5761 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5762 {
5763 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5764 if (!stmt || gimple_bb (stmt) == NULL)
5765 continue;
5766 if (is_pattern_stmt_p (stmt_info)
5767 || !PURE_SLP_STMT (stmt_info))
5768 continue;
5769 lhs = gimple_call_lhs (stmt);
5770 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5771 gsi = gsi_for_stmt (stmt);
5772 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5773 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5774 }
5775 }
5776
5777 static void
5778 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5779 {
5780 hash_set<slp_tree> visited;
5781 vect_remove_slp_scalar_calls (vinfo, node, visited);
5782 }
5783
5784 /* Vectorize the instance root. */
5785
5786 void
5787 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5788 {
5789 gassign *rstmt = NULL;
5790
5791 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5792 {
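 /* A single vector def can be assigned to the root lhs directly, via a
    VIEW_CONVERT_EXPR if the types differ.  */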
5793 gimple *child_stmt;
5794 int j;
5795
5796 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5797 {
5798 tree vect_lhs = gimple_get_lhs (child_stmt);
5799 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5800 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5801 TREE_TYPE (vect_lhs)))
5802 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5803 vect_lhs);
5804 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5805 break;
5806 }
5807 }
5808 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5809 {
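 /* With several vector stmts gather all of their defs into a CONSTRUCTOR
    of the root's RHS type and assign that to the root lhs.  */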
5810 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5811 gimple *child_stmt;
5812 int j;
5813 vec<constructor_elt, va_gc> *v;
5814 vec_alloc (v, nelts);
5815
5816 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5817 {
5818 CONSTRUCTOR_APPEND_ELT (v,
5819 NULL_TREE,
5820 gimple_get_lhs (child_stmt));
5821 }
5822 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5823 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5824 tree r_constructor = build_constructor (rtype, v);
5825 rstmt = gimple_build_assign (lhs, r_constructor);
5826 }
5827
5828 gcc_assert (rstmt);
5829
5830 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5831 gsi_replace (&rgsi, rstmt, true);
5832 }
5833
5834 struct slp_scc_info
5835 {
5836 bool on_stack;	/* Whether the node is currently on the SCC stack.  */
5837 int dfs;		/* DFS pre-order number assigned on the first visit.  */
5838 int lowlink;		/* Smallest DFS number reachable from the node.  */
5839 };
5840
5841 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs,
 Tarjan-style, using the dfs/lowlink numbers in slp_scc_info. */
5842
5843 static void
5844 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5845 hash_map<slp_tree, slp_scc_info> &scc_info,
5846 int &maxdfs, vec<slp_tree> &stack)
5847 {
5848 bool existed_p;
5849 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5850 gcc_assert (!existed_p);
5851 info->dfs = maxdfs;
5852 info->lowlink = maxdfs;
5853 maxdfs++;
5854
5855 /* Leaf. */
5856 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5857 {
5858 info->on_stack = false;
5859 vect_schedule_slp_node (vinfo, node, instance);
5860 return;
5861 }
5862
5863 info->on_stack = true;
5864 stack.safe_push (node);
5865
5866 unsigned i;
5867 slp_tree child;
5868 /* DFS recurse. */
5869 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5870 {
5871 if (!child)
5872 continue;
5873 slp_scc_info *child_info = scc_info.get (child);
5874 if (!child_info)
5875 {
5876 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5877 /* Recursion might have re-allocated the node. */
5878 info = scc_info.get (node);
5879 child_info = scc_info.get (child);
5880 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5881 }
5882 else if (child_info->on_stack)
5883 info->lowlink = MIN (info->lowlink, child_info->dfs);
5884 }
5885 if (info->lowlink != info->dfs)
5886 return;
5887
5888 auto_vec<slp_tree, 4> phis_to_fixup;
5889
5890 /* Singleton. */
5891 if (stack.last () == node)
5892 {
5893 stack.pop ();
5894 info->on_stack = false;
5895 vect_schedule_slp_node (vinfo, node, instance);
5896 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5897 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5898 phis_to_fixup.quick_push (node);
5899 }
5900 else
5901 {
5902 /* SCC. */
5903 int last_idx = stack.length () - 1;
5904 while (stack[last_idx] != node)
5905 last_idx--;
5906 /* We can break the cycle at PHIs which have at least one child
5907 code generated. Then we could re-start the DFS walk until
5908 all nodes in the SCC are covered (we might get new entries
5909 for nodes only reachable via back edges). But it's simpler to just
5910 iterate and schedule those that are ready. */
5911 unsigned todo = stack.length () - last_idx;
5912 do
5913 {
5914 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5915 {
5916 slp_tree entry = stack[idx];
5917 if (!entry)
5918 continue;
5919 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5920 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
5921 bool ready = !phi;
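 /* A non-PHI is ready when none of its children are still on the stack;
    a PHI is ready as soon as one child has been scheduled (or is absent),
    its remaining backedge args are fixed up afterwards.  */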
5922 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5923 if (!child)
5924 {
5925 gcc_assert (phi);
5926 ready = true;
5927 break;
5928 }
5929 else if (scc_info.get (child)->on_stack)
5930 {
5931 if (!phi)
5932 {
5933 ready = false;
5934 break;
5935 }
5936 }
5937 else
5938 {
5939 if (phi)
5940 {
5941 ready = true;
5942 break;
5943 }
5944 }
5945 if (ready)
5946 {
5947 vect_schedule_slp_node (vinfo, entry, instance);
5948 scc_info.get (entry)->on_stack = false;
5949 stack[idx] = NULL;
5950 todo--;
5951 if (phi)
5952 phis_to_fixup.safe_push (entry);
5953 }
5954 }
5955 }
5956 while (todo != 0);
5957
5958 /* Pop the SCC. */
5959 stack.truncate (last_idx);
5960 }
5961
5962 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
5963 slp_tree phi_node;
5964 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5965 {
5966 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5967 edge_iterator ei;
5968 edge e;
5969 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5970 {
5971 unsigned dest_idx = e->dest_idx;
5972 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5973 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5974 continue;
5975 /* Simply fill all args. */
5976 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5977 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5978 vect_get_slp_vect_def (child, i),
5979 e, gimple_phi_arg_location (phi, dest_idx));
5980 }
5981 }
5982 }
5983
5984 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5985
5986 void
5987 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5988 {
5989 slp_instance instance;
5990 unsigned int i;
5991
5992 hash_map<slp_tree, slp_scc_info> scc_info;
5993 int maxdfs = 0;
5994 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5995 {
5996 slp_tree node = SLP_INSTANCE_TREE (instance);
5997 if (dump_enabled_p ())
5998 {
5999 dump_printf_loc (MSG_NOTE, vect_location,
6000 "Vectorizing SLP tree:\n");
6001 if (SLP_INSTANCE_ROOT_STMT (instance))
6002 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6003 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6004 vect_print_slp_graph (MSG_NOTE, vect_location,
6005 SLP_INSTANCE_TREE (instance));
6006 }
6007 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
6008 a PHI is the node breaking the cycle. */
6009 auto_vec<slp_tree> stack;
6010 if (!scc_info.get (node))
6011 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6012
6013 if (SLP_INSTANCE_ROOT_STMT (instance))
6014 vectorize_slp_instance_root_stmt (node, instance);
6015
6016 if (dump_enabled_p ())
6017 dump_printf_loc (MSG_NOTE, vect_location,
6018 "vectorizing stmts using SLP.\n");
6019 }
6020
6021 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6022 {
6023 slp_tree root = SLP_INSTANCE_TREE (instance);
6024 stmt_vec_info store_info;
6025 unsigned int j;
6026
6027 /* Remove scalar call stmts. Do not do this for basic-block
6028 vectorization as not all uses may be vectorized.
6029 ??? Why should this be necessary? DCE should be able to
6030 remove the stmts itself.
6031 ??? For BB vectorization we can as well remove scalar
6032 stmts starting from the SLP tree root if they have no
6033 uses. */
6034 if (is_a <loop_vec_info> (vinfo))
6035 vect_remove_slp_scalar_calls (vinfo, root);
6036
6037 /* Remove the original scalar stmts of vectorized stores. */
6038 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6039 {
6040 if (!STMT_VINFO_DATA_REF (store_info)
6041 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6042 break;
6043
6044 store_info = vect_orig_stmt (store_info);
6045 /* Free the attached stmt_vec_info and remove the stmt. */
6046 vinfo->remove_stmt (store_info);
6047 }
6048 }
6049 }