middle-end: Support complex Addition
[gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
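/* Illustrative sketch, not part of the original source: the constructor
   and destructor above thread every live node onto a doubly linked list
   headed by slp_first_node, which is what lets vect_slp_fini reclaim
   whatever SLP discovery left behind:

     vect_slp_init ();
     slp_tree a = new _slp_tree;     slp_first_node == a
     slp_tree b = new _slp_tree;     b->next_node == a, slp_first_node == b
     delete b;                       b is unlinked, slp_first_node == a again
     vect_slp_fini ();               deletes a, then releases slp_tree_pool

   The raw storage itself always comes from the object_allocator pool set
   up in vect_slp_init via the operator new/delete overloads above.  */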
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 delete node;
150 }
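/* Illustrative sketch, not part of the original source: how the reference
   count checked above is used when one SLP subtree is shared by two
   parents (PARENT1, PARENT2 and CHILD are hypothetical nodes):

     SLP_TREE_CHILDREN (parent1).safe_push (child);
     SLP_TREE_CHILDREN (parent2).safe_push (child);
     SLP_TREE_REF_COUNT (child)++;      second reference, for parent2
     ...
     vect_free_slp_tree (parent1);      drops CHILD's count to 1
     vect_free_slp_tree (parent2);      count reaches zero, CHILD is deleted

   A freshly created node starts with SLP_TREE_REF_COUNT == 1, so a node
   with a single parent needs no explicit increment.  */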
151
152 /* Return a location suitable for dumps related to the SLP instance.  */
153
154 dump_user_location_t
155 _slp_instance::location () const
156 {
157 if (root_stmt)
158 return root_stmt->stmt;
159 else
160 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
161 }
162
163
164 /* Free the memory allocated for the SLP instance. */
165
166 void
167 vect_free_slp_instance (slp_instance instance)
168 {
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
170 SLP_INSTANCE_LOADS (instance).release ();
171 instance->subgraph_entries.release ();
172 instance->cost_vec.release ();
173 free (instance);
174 }
175
176
177 /* Create an SLP node with NOPS child slots and operation code CODE.  */
178
179 slp_tree
180 vect_create_new_slp_node (unsigned nops, tree_code code)
181 {
182 slp_tree node = new _slp_tree;
183 SLP_TREE_SCALAR_STMTS (node) = vNULL;
184 SLP_TREE_CHILDREN (node).create (nops);
185 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
186 SLP_TREE_CODE (node) = code;
187 return node;
188 }
189 /* Create an SLP node for SCALAR_STMTS. */
190
191 static slp_tree
192 vect_create_new_slp_node (slp_tree node,
193 vec<stmt_vec_info> scalar_stmts, unsigned nops)
194 {
195 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
196 SLP_TREE_CHILDREN (node).create (nops);
197 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
198 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
199 SLP_TREE_LANES (node) = scalar_stmts.length ();
200 return node;
201 }
202
203 /* Create an SLP node for SCALAR_STMTS. */
204
205 static slp_tree
206 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
207 {
208 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
209 }
210
211 /* Create an SLP node for OPS. */
212
213 static slp_tree
214 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
215 {
216 SLP_TREE_SCALAR_OPS (node) = ops;
217 SLP_TREE_DEF_TYPE (node) = vect_external_def;
218 SLP_TREE_LANES (node) = ops.length ();
219 return node;
220 }
221
222 /* Create an SLP node for OPS. */
223
224 static slp_tree
225 vect_create_new_slp_node (vec<tree> ops)
226 {
227 return vect_create_new_slp_node (new _slp_tree, ops);
228 }
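/* Illustrative sketch, not part of the original source: the two common
   ways the overloads above are used during SLP discovery (STMTS and OPS
   are hypothetical, already populated vectors):

     slp_tree inner = vect_create_new_slp_node (stmts, 2);
         internal def node with two child slots,
         SLP_TREE_REPRESENTATIVE == stmts[0], lanes == stmts.length ()

     slp_tree ext = vect_create_new_slp_node (ops);
         external def node carrying the scalar operands directly

   Callers such as vect_build_slp_tree_2 later override the def type of
   such an operand node to vect_constant_def when all of OPS turn out to
   be constants.  */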
229
230
231 /* This structure is used in creation of an SLP tree. Each instance
232 corresponds to the same operand in a group of scalar stmts in an SLP
233 node. */
234 typedef struct _slp_oprnd_info
235 {
236 /* Def-stmts for the operands. */
237 vec<stmt_vec_info> def_stmts;
238 /* Operands. */
239 vec<tree> ops;
240 /* Information about the first statement: its vector def-type, the type
241 of its operand, and an indication whether any def in the group is a
242 pattern stmt. */
243 tree first_op_type;
244 enum vect_def_type first_dt;
245 bool any_pattern;
246 } *slp_oprnd_info;
247
248
249 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
250 operand. */
251 static vec<slp_oprnd_info>
252 vect_create_oprnd_info (int nops, int group_size)
253 {
254 int i;
255 slp_oprnd_info oprnd_info;
256 vec<slp_oprnd_info> oprnds_info;
257
258 oprnds_info.create (nops);
259 for (i = 0; i < nops; i++)
260 {
261 oprnd_info = XNEW (struct _slp_oprnd_info);
262 oprnd_info->def_stmts.create (group_size);
263 oprnd_info->ops.create (group_size);
264 oprnd_info->first_dt = vect_uninitialized_def;
265 oprnd_info->first_op_type = NULL_TREE;
266 oprnd_info->any_pattern = false;
267 oprnds_info.quick_push (oprnd_info);
268 }
269
270 return oprnds_info;
271 }
272
273
274 /* Free operands info. */
275
276 static void
277 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
278 {
279 int i;
280 slp_oprnd_info oprnd_info;
281
282 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
283 {
284 oprnd_info->def_stmts.release ();
285 oprnd_info->ops.release ();
286 XDELETE (oprnd_info);
287 }
288
289 oprnds_info.release ();
290 }
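/* Illustrative sketch, not part of the original source: the usual
   lifetime of the operand info vector around per-lane operand
   collection.

     vec<slp_oprnd_info> oprnds_info
       = vect_create_oprnd_info (nops, group_size);
     for (unsigned lane = 0; lane < group_size; ++lane)
       if (vect_get_and_check_slp_defs (vinfo, swap[lane], skip_args,
                                        stmts, lane, &oprnds_info) != 0)
         break;
     ...
     vect_free_oprnd_info (oprnds_info);

   This mirrors what vect_build_slp_tree_2 below does before recursing
   on each operand's def_stmts vector.  */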
291
292
293 /* Return true if STMTS contains a pattern statement. */
294
295 static bool
296 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
297 {
298 stmt_vec_info stmt_info;
299 unsigned int i;
300 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
301 if (is_pattern_stmt_p (stmt_info))
302 return true;
303 return false;
304 }
305
306 /* Return true when all lanes in the external or constant NODE have
307 the same value. */
308
309 static bool
310 vect_slp_tree_uniform_p (slp_tree node)
311 {
312 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
313 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
314
315 /* Pre-existing vectors. */
316 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
317 return false;
318
319 unsigned i;
320 tree op, first = NULL_TREE;
321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
322 if (!first)
323 first = op;
324 else if (!operand_equal_p (first, op, 0))
325 return false;
326
327 return true;
328 }
329
330 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
331 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
332 of the chain. */
333
334 int
335 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
336 stmt_vec_info first_stmt_info)
337 {
338 stmt_vec_info next_stmt_info = first_stmt_info;
339 int result = 0;
340
341 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
342 return -1;
343
344 do
345 {
346 if (next_stmt_info == stmt_info)
347 return result;
348 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
349 if (next_stmt_info)
350 result += DR_GROUP_GAP (next_stmt_info);
351 }
352 while (next_stmt_info);
353
354 return -1;
355 }
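/* Illustrative example, not part of the original source: for an
   interleaving chain that loads a[0], a[1] and a[3] (a[2] is never read),
   and assuming DR_GROUP_GAP of each non-leading element records its
   distance from the previous element, the places returned above are

     vect_get_place_in_interleaving_chain (load of a[0], first) == 0
     vect_get_place_in_interleaving_chain (load of a[1], first) == 1
     vect_get_place_in_interleaving_chain (load of a[3], first) == 3

   and -1 for any statement whose DR_GROUP_FIRST_ELEMENT is not
   FIRST_STMT_INFO.  */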
356
357 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
358 using the method implemented by duplicate_and_interleave. Return true
359 if so, returning the number of intermediate vectors in *NVECTORS_OUT
360 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
361 (if nonnull). */
362
363 bool
364 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
365 tree elt_type, unsigned int *nvectors_out,
366 tree *vector_type_out,
367 tree *permutes)
368 {
369 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
370 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
371 return false;
372
373 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
374 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
375 unsigned int nvectors = 1;
376 for (;;)
377 {
378 scalar_int_mode int_mode;
379 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
380 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
381 {
382 /* Get the natural vector type for this SLP group size. */
383 tree int_type = build_nonstandard_integer_type
384 (GET_MODE_BITSIZE (int_mode), 1);
385 tree vector_type
386 = get_vectype_for_scalar_type (vinfo, int_type, count);
387 if (vector_type
388 && VECTOR_MODE_P (TYPE_MODE (vector_type))
389 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
390 GET_MODE_SIZE (base_vector_mode)))
391 {
392 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
393 together into elements of type INT_TYPE and using the result
394 to build NVECTORS vectors. */
395 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
396 vec_perm_builder sel1 (nelts, 2, 3);
397 vec_perm_builder sel2 (nelts, 2, 3);
398 poly_int64 half_nelts = exact_div (nelts, 2);
399 for (unsigned int i = 0; i < 3; ++i)
400 {
401 sel1.quick_push (i);
402 sel1.quick_push (i + nelts);
403 sel2.quick_push (half_nelts + i);
404 sel2.quick_push (half_nelts + i + nelts);
405 }
406 vec_perm_indices indices1 (sel1, 2, nelts);
407 vec_perm_indices indices2 (sel2, 2, nelts);
408 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
409 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
410 {
411 if (nvectors_out)
412 *nvectors_out = nvectors;
413 if (vector_type_out)
414 *vector_type_out = vector_type;
415 if (permutes)
416 {
417 permutes[0] = vect_gen_perm_mask_checked (vector_type,
418 indices1);
419 permutes[1] = vect_gen_perm_mask_checked (vector_type,
420 indices2);
421 }
422 return true;
423 }
424 }
425 }
426 if (!multiple_p (elt_bytes, 2, &elt_bytes))
427 return false;
428 nvectors *= 2;
429 }
430 }
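/* Illustrative example, not part of the original source: for a
   four-element integer vector type the two permutations built above are
   the usual interleave-low/interleave-high selectors

     indices1 = { 0, 4, 1, 5 }    (sel1: i, i + nelts)
     indices2 = { 2, 6, 3, 7 }    (sel2: half_nelts + i, half_nelts + i + nelts)

   i.e. duplicate_and_interleave can fuse consecutive runs of
   COUNT / NVECTORS elements into wider integer elements and interleave
   the results, doubling NVECTORS (and halving the fused element size)
   until a configuration the target supports is found.  */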
431
432 /* Return true if DTA and DTB match. */
433
434 static bool
435 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
436 {
437 return (dta == dtb
438 || ((dta == vect_external_def || dta == vect_constant_def)
439 && (dtb == vect_external_def || dtb == vect_constant_def)));
440 }
441
442 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
443 they are of a valid type and that they match the defs of the first stmt of
444 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
445 by swapping operands of STMTS[STMT_NUM] when possible. A non-zero SWAP
446 indicates that a swap is required for cond_expr stmts: SWAP is 1 if
447 STMT is a cond and the operands of its comparison need to be swapped;
448 SWAP is 2 if STMT is a cond and its comparison code needs to be
449 inverted.
450
451 Return -1 if there was a fatal error; return 1 if the error could be
452 corrected by swapping the operands of this node's parent; return 0 if
453 everything is ok. */
454 static int
455 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
456 bool *skip_args,
457 vec<stmt_vec_info> stmts, unsigned stmt_num,
458 vec<slp_oprnd_info> *oprnds_info)
459 {
460 stmt_vec_info stmt_info = stmts[stmt_num];
461 tree oprnd;
462 unsigned int i, number_of_oprnds;
463 enum vect_def_type dt = vect_uninitialized_def;
464 slp_oprnd_info oprnd_info;
465 int first_op_idx = 1;
466 unsigned int commutative_op = -1U;
467 bool first_op_cond = false;
468 bool first = stmt_num == 0;
469
470 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
471 {
472 number_of_oprnds = gimple_call_num_args (stmt);
473 first_op_idx = 3;
474 if (gimple_call_internal_p (stmt))
475 {
476 internal_fn ifn = gimple_call_internal_fn (stmt);
477 commutative_op = first_commutative_argument (ifn);
478
479 /* Masked load, only look at mask. */
480 if (ifn == IFN_MASK_LOAD)
481 {
482 number_of_oprnds = 1;
483 /* Mask operand index. */
484 first_op_idx = 5;
485 }
486 }
487 }
488 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
489 {
490 enum tree_code code = gimple_assign_rhs_code (stmt);
491 number_of_oprnds = gimple_num_ops (stmt) - 1;
492 /* Swap can only be done for cond_expr if asked to, otherwise we
493 could end up with a different comparison code from the first stmt. */
494 if (code == COND_EXPR
495 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
496 {
497 first_op_cond = true;
498 number_of_oprnds++;
499 }
500 else
501 commutative_op = commutative_tree_code (code) ? 0U : -1U;
502 }
503 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
504 number_of_oprnds = gimple_phi_num_args (stmt);
505 else
506 return -1;
507
508 bool swapped = (swap != 0);
509 bool backedge = false;
510 gcc_assert (!swapped || first_op_cond);
511 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
512 for (i = 0; i < number_of_oprnds; i++)
513 {
514 if (first_op_cond)
515 {
516 /* Map indicating how operands of cond_expr should be swapped. */
517 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
518 int *map = maps[swap];
519
520 if (i < 2)
521 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
522 first_op_idx), map[i]);
523 else
524 oprnd = gimple_op (stmt_info->stmt, map[i]);
525 }
526 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
527 {
528 oprnd = gimple_phi_arg_def (stmt, i);
529 backedge = dominated_by_p (CDI_DOMINATORS,
530 gimple_phi_arg_edge (stmt, i)->src,
531 gimple_bb (stmt_info->stmt));
532 }
533 else
534 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
535 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
536 oprnd = TREE_OPERAND (oprnd, 0);
537
538 oprnd_info = (*oprnds_info)[i];
539
540 stmt_vec_info def_stmt_info;
541 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
542 {
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "Build SLP failed: can't analyze def for %T\n",
546 oprnd);
547
548 return -1;
549 }
550
551 if (skip_args[i])
552 {
553 oprnd_info->def_stmts.quick_push (NULL);
554 oprnd_info->ops.quick_push (NULL_TREE);
555 oprnd_info->first_dt = vect_uninitialized_def;
556 continue;
557 }
558
559 oprnd_info->def_stmts.quick_push (def_stmt_info);
560 oprnd_info->ops.quick_push (oprnd);
561
562 if (def_stmt_info
563 && is_pattern_stmt_p (def_stmt_info))
564 {
565 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
566 != def_stmt_info)
567 oprnd_info->any_pattern = true;
568 else
569 /* If we promote this to external, use the original stmt def. */
570 oprnd_info->ops.last ()
571 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
572 }
573
574 /* If there's an extern def on a backedge make sure we can
575 code-generate at the region start.
576 ??? This is another case that could be fixed by adjusting
577 how we split the function but at the moment we'd have conflicting
578 goals there. */
579 if (backedge
580 && dts[i] == vect_external_def
581 && is_a <bb_vec_info> (vinfo)
582 && TREE_CODE (oprnd) == SSA_NAME
583 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
584 && !dominated_by_p (CDI_DOMINATORS,
585 as_a <bb_vec_info> (vinfo)->bbs[0],
586 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Build SLP failed: extern def %T only defined "
591 "on backedge\n", oprnd);
592 return -1;
593 }
594
595 if (first)
596 {
597 tree type = TREE_TYPE (oprnd);
598 dt = dts[i];
599 if ((dt == vect_constant_def
600 || dt == vect_external_def)
601 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
602 && (TREE_CODE (type) == BOOLEAN_TYPE
603 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
604 type)))
605 {
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Build SLP failed: invalid type of def "
609 "for variable-length SLP %T\n", oprnd);
610 return -1;
611 }
612
613 /* For the swapping logic below force vect_reduction_def
614 for the reduction op in a SLP reduction group. */
615 if (!STMT_VINFO_DATA_REF (stmt_info)
616 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
617 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
618 && def_stmt_info)
619 dts[i] = dt = vect_reduction_def;
620
621 /* Check the types of the definition. */
622 switch (dt)
623 {
624 case vect_external_def:
625 case vect_constant_def:
626 case vect_internal_def:
627 case vect_reduction_def:
628 case vect_induction_def:
629 case vect_nested_cycle:
630 break;
631
632 default:
633 /* FORNOW: Not supported. */
634 if (dump_enabled_p ())
635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
636 "Build SLP failed: illegal type of def %T\n",
637 oprnd);
638 return -1;
639 }
640
641 oprnd_info->first_dt = dt;
642 oprnd_info->first_op_type = type;
643 }
644 }
645 if (first)
646 return 0;
647
648 /* Now match the operand definition types to that of the first stmt. */
649 for (i = 0; i < number_of_oprnds;)
650 {
651 if (skip_args[i])
652 {
653 ++i;
654 continue;
655 }
656
657 oprnd_info = (*oprnds_info)[i];
658 dt = dts[i];
659 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
660 oprnd = oprnd_info->ops[stmt_num];
661 tree type = TREE_TYPE (oprnd);
662
663 if (!types_compatible_p (oprnd_info->first_op_type, type))
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "Build SLP failed: different operand types\n");
668 return 1;
669 }
670
671 /* Not first stmt of the group, check that the def-stmt/s match
672 the def-stmt/s of the first stmt. Allow different definition
673 types for reduction chains: the first stmt must be a
674 vect_reduction_def (a phi node), and the rest must be part of
675 the reduction chain. */
676 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
677 && !(oprnd_info->first_dt == vect_reduction_def
678 && !STMT_VINFO_DATA_REF (stmt_info)
679 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
680 && def_stmt_info
681 && !STMT_VINFO_DATA_REF (def_stmt_info)
682 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
683 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
684 || (!STMT_VINFO_DATA_REF (stmt_info)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
686 && ((!def_stmt_info
687 || STMT_VINFO_DATA_REF (def_stmt_info)
688 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
689 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
690 != (oprnd_info->first_dt != vect_reduction_def))))
691 {
692 /* Try swapping operands if we got a mismatch. For BB
693 vectorization only in case it will clearly improve things. */
694 if (i == commutative_op && !swapped
695 && (!is_a <bb_vec_info> (vinfo)
696 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
697 dts[i+1])
698 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
699 || vect_def_types_match
700 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
701 {
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_NOTE, vect_location,
704 "trying swapped operands\n");
705 std::swap (dts[i], dts[i+1]);
706 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
707 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
708 std::swap ((*oprnds_info)[i]->ops[stmt_num],
709 (*oprnds_info)[i+1]->ops[stmt_num]);
710 swapped = true;
711 continue;
712 }
713
714 if (is_a <bb_vec_info> (vinfo)
715 && !oprnd_info->any_pattern)
716 {
717 /* Now for commutative ops we should see whether we can
718 make the other operand match. */
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "treating operand as external\n");
722 oprnd_info->first_dt = dt = vect_external_def;
723 }
724 else
725 {
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "Build SLP failed: different types\n");
729 return 1;
730 }
731 }
732
733 /* Make sure to demote the overall operand to external. */
734 if (dt == vect_external_def)
735 oprnd_info->first_dt = vect_external_def;
736 /* For an SLP reduction chain we want to duplicate the reduction to
737 each of the chain members. That gets us a sane SLP graph (though
738 the stmts are not 100% correct wrt the initial values). */
739 else if ((dt == vect_internal_def
740 || dt == vect_reduction_def)
741 && oprnd_info->first_dt == vect_reduction_def
742 && !STMT_VINFO_DATA_REF (stmt_info)
743 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
744 && !STMT_VINFO_DATA_REF (def_stmt_info)
745 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
746 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
747 {
748 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
749 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
750 }
751
752 ++i;
753 }
754
755 /* Swap operands. */
756 if (swapped)
757 {
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE, vect_location,
760 "swapped operands to match def types in %G",
761 stmt_info->stmt);
762 }
763
764 return 0;
765 }
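/* Illustrative example, not part of the original source: what the SWAP
   values mean for a COND_EXPR lane when the first lane of the group is
   "a >= b ? x : y".

     swap == 1:  the lane is "b <= a ? x : y"; the map {1, 0, 2, 3} above
                 reads the two comparison operands in swapped order, so
                 they line up with (a, b) from the first lane.
     swap == 2:  the lane is "a < b ? y : x"; the map {0, 1, 3, 2} reads
                 the two arms in swapped order, which together with the
                 inverted comparison code yields the same result.

   vect_build_slp_tree_1 below is responsible for computing these SWAP
   values before this function is called.  */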
766
767 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
768 Return true if we can, meaning that this choice doesn't conflict with
769 existing SLP nodes that use STMT_INFO. */
770
771 bool
772 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
773 {
774 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
775 if (old_vectype)
776 return useless_type_conversion_p (vectype, old_vectype);
777
778 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
779 {
780 /* We maintain the invariant that if any statement in the group is
781 used, all other members of the group have the same vector type. */
782 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
783 stmt_vec_info member_info = first_info;
784 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
785 if (is_pattern_stmt_p (member_info)
786 && !useless_type_conversion_p (vectype,
787 STMT_VINFO_VECTYPE (member_info)))
788 break;
789
790 if (!member_info)
791 {
792 for (member_info = first_info; member_info;
793 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
794 STMT_VINFO_VECTYPE (member_info) = vectype;
795 return true;
796 }
797 }
798 else if (!is_pattern_stmt_p (stmt_info))
799 {
800 STMT_VINFO_VECTYPE (stmt_info) = vectype;
801 return true;
802 }
803
804 if (dump_enabled_p ())
805 {
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 "Build SLP failed: incompatible vector"
808 " types for: %G", stmt_info->stmt);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " old vector type: %T\n", old_vectype);
811 dump_printf_loc (MSG_NOTE, vect_location,
812 " new vector type: %T\n", vectype);
813 }
814 return false;
815 }
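/* Illustrative example, not part of the original source: the intended
   usage of the function above when several SLP instances share a grouped
   access (the vector type names are hypothetical):

     if (vect_update_shared_vectype (stmt_info, v4si_vectype))
       every member of stmt_info's DR group now has
       STMT_VINFO_VECTYPE == v4si_vectype

     vect_update_shared_vectype (stmt_info, v8hi_vectype)
       now fails, because the already recorded type is not a useless
       conversion away from the requested one.

   This keeps all users of one interleaving group agreeing on a single
   vector type.  */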
816
817 /* Return true if call statements CALL1 and CALL2 are similar enough
818 to be combined into the same SLP group. */
819
820 static bool
821 compatible_calls_p (gcall *call1, gcall *call2)
822 {
823 unsigned int nargs = gimple_call_num_args (call1);
824 if (nargs != gimple_call_num_args (call2))
825 return false;
826
827 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
828 return false;
829
830 if (gimple_call_internal_p (call1))
831 {
832 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
833 TREE_TYPE (gimple_call_lhs (call2))))
834 return false;
835 for (unsigned int i = 0; i < nargs; ++i)
836 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
837 TREE_TYPE (gimple_call_arg (call2, i))))
838 return false;
839 }
840 else
841 {
842 if (!operand_equal_p (gimple_call_fn (call1),
843 gimple_call_fn (call2), 0))
844 return false;
845
846 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
847 return false;
848 }
849 return true;
850 }
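/* Illustrative example, not part of the original source: under the rules
   above two lanes calling sqrtf (a[0]) and sqrtf (a[1]) can be grouped
   (same combined function, compatible argument and return types), while
   mixing sqrtf and sqrt in one group is rejected already at the
   gimple_call_combined_fn comparison.  */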
851
852 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
853 caller's attempt to find the vector type in STMT_INFO with the narrowest
854 element type. Return true if VECTYPE is nonnull and if it is valid
855 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
856 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
857 vect_build_slp_tree. */
858
859 static bool
860 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
861 unsigned int group_size,
862 tree vectype, poly_uint64 *max_nunits)
863 {
864 if (!vectype)
865 {
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
868 "Build SLP failed: unsupported data-type in %G\n",
869 stmt_info->stmt);
870 /* Fatal mismatch. */
871 return false;
872 }
873
874 /* If populating the vector type requires unrolling then fail
875 before adjusting *max_nunits for basic-block vectorization. */
876 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
877 unsigned HOST_WIDE_INT const_nunits;
878 if (is_a <bb_vec_info> (vinfo)
879 && (!nunits.is_constant (&const_nunits)
880 || const_nunits > group_size))
881 {
882 if (dump_enabled_p ())
883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
884 "Build SLP failed: unrolling required "
885 "in basic block SLP\n");
886 /* Fatal mismatch. */
887 return false;
888 }
889
890 /* In case of multiple types we need to detect the smallest type. */
891 vect_update_max_nunits (max_nunits, vectype);
892 return true;
893 }
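/* Illustrative example, not part of the original source: for basic-block
   SLP with a group of four ints, a V4SImode vector type passes the check
   above (const_nunits == group_size), while a V8SImode type is rejected
   because populating it would require unrolling (const_nunits greater
   than group_size).  For loop vectorization no such restriction applies
   and *MAX_NUNITS is simply bumped to the largest number of units seen
   so far.  */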
894
895 /* Verify that the scalar stmts STMTS are isomorphic, do not require data
896 permutation and are not of unsupported types of operation. Return
897 true if so, otherwise return false and indicate in *MATCHES
898 which stmts are not isomorphic to the first one. If MATCHES[0]
899 is false then this indicates the comparison could not be
900 carried out or the stmts will never be vectorized by SLP.
901
902 Note COND_EXPR is possibly isomorphic to another one after swapping its
903 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
904 the first stmt by swapping the two operands of comparison; set SWAP[i]
905 to 2 if stmt I is isomorphic to the first stmt by inverting the code
906 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
907 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
908
909 static bool
910 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
911 vec<stmt_vec_info> stmts, unsigned int group_size,
912 poly_uint64 *max_nunits, bool *matches,
913 bool *two_operators, tree *node_vectype)
914 {
915 unsigned int i;
916 stmt_vec_info first_stmt_info = stmts[0];
917 enum tree_code first_stmt_code = ERROR_MARK;
918 enum tree_code alt_stmt_code = ERROR_MARK;
919 enum tree_code rhs_code = ERROR_MARK;
920 enum tree_code first_cond_code = ERROR_MARK;
921 tree lhs;
922 bool need_same_oprnds = false;
923 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
924 optab optab;
925 int icode;
926 machine_mode optab_op2_mode;
927 machine_mode vec_mode;
928 stmt_vec_info first_load = NULL, prev_first_load = NULL;
929 bool first_stmt_load_p = false, load_p = false;
930 bool first_stmt_phi_p = false, phi_p = false;
931
932 /* For every stmt in NODE find its def stmt/s. */
933 stmt_vec_info stmt_info;
934 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
935 {
936 gimple *stmt = stmt_info->stmt;
937 swap[i] = 0;
938 matches[i] = false;
939
940 if (dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
942
943 /* Fail to vectorize statements marked as unvectorizable, that can
944 throw internally, or that have volatile operands. */
945 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
946 || stmt_can_throw_internal (cfun, stmt)
947 || gimple_has_volatile_ops (stmt))
948 {
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
951 "Build SLP failed: unvectorizable statement %G",
952 stmt);
953 /* ??? For BB vectorization we want to commute operands in a way
954 to shuffle all unvectorizable defs into one operand and have
955 the other still vectorized. The following doesn't reliably
956 achieve this, but it's the easiest we can do here. */
957 if (is_a <bb_vec_info> (vinfo) && i != 0)
958 continue;
959 /* Fatal mismatch. */
960 matches[0] = false;
961 return false;
962 }
963
964 lhs = gimple_get_lhs (stmt);
965 if (lhs == NULL_TREE)
966 {
967 if (dump_enabled_p ())
968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
969 "Build SLP failed: not GIMPLE_ASSIGN nor "
970 "GIMPLE_CALL %G", stmt);
971 if (is_a <bb_vec_info> (vinfo) && i != 0)
972 continue;
973 /* Fatal mismatch. */
974 matches[0] = false;
975 return false;
976 }
977
978 tree nunits_vectype;
979 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
980 &nunits_vectype, group_size)
981 || (nunits_vectype
982 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
983 nunits_vectype, max_nunits)))
984 {
985 if (is_a <bb_vec_info> (vinfo) && i != 0)
986 continue;
987 /* Fatal mismatch. */
988 matches[0] = false;
989 return false;
990 }
991
992 gcc_assert (vectype);
993
994 gcall *call_stmt = dyn_cast <gcall *> (stmt);
995 if (call_stmt)
996 {
997 rhs_code = CALL_EXPR;
998
999 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1000 load_p = true;
1001 else if ((gimple_call_internal_p (call_stmt)
1002 && (!vectorizable_internal_fn_p
1003 (gimple_call_internal_fn (call_stmt))))
1004 || gimple_call_tail_p (call_stmt)
1005 || gimple_call_noreturn_p (call_stmt)
1006 || !gimple_call_nothrow_p (call_stmt)
1007 || gimple_call_chain (call_stmt))
1008 {
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1011 "Build SLP failed: unsupported call type %G",
1012 call_stmt);
1013 if (is_a <bb_vec_info> (vinfo) && i != 0)
1014 continue;
1015 /* Fatal mismatch. */
1016 matches[0] = false;
1017 return false;
1018 }
1019 }
1020 else if (gimple_code (stmt) == GIMPLE_PHI)
1021 {
1022 rhs_code = ERROR_MARK;
1023 phi_p = true;
1024 }
1025 else
1026 {
1027 rhs_code = gimple_assign_rhs_code (stmt);
1028 load_p = gimple_vuse (stmt);
1029 }
1030
1031 /* Check the operation. */
1032 if (i == 0)
1033 {
1034 *node_vectype = vectype;
1035 first_stmt_code = rhs_code;
1036 first_stmt_load_p = load_p;
1037 first_stmt_phi_p = phi_p;
1038
1039 /* Shift arguments should be equal in all the packed stmts for a
1040 vector shift with scalar shift operand. */
1041 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1042 || rhs_code == LROTATE_EXPR
1043 || rhs_code == RROTATE_EXPR)
1044 {
1045 vec_mode = TYPE_MODE (vectype);
1046
1047 /* First see if we have a vector/vector shift. */
1048 optab = optab_for_tree_code (rhs_code, vectype,
1049 optab_vector);
1050
1051 if (!optab
1052 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1053 {
1054 /* No vector/vector shift, try for a vector/scalar shift. */
1055 optab = optab_for_tree_code (rhs_code, vectype,
1056 optab_scalar);
1057
1058 if (!optab)
1059 {
1060 if (dump_enabled_p ())
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1062 "Build SLP failed: no optab.\n");
1063 if (is_a <bb_vec_info> (vinfo) && i != 0)
1064 continue;
1065 /* Fatal mismatch. */
1066 matches[0] = false;
1067 return false;
1068 }
1069 icode = (int) optab_handler (optab, vec_mode);
1070 if (icode == CODE_FOR_nothing)
1071 {
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "Build SLP failed: "
1075 "op not supported by target.\n");
1076 if (is_a <bb_vec_info> (vinfo) && i != 0)
1077 continue;
1078 /* Fatal mismatch. */
1079 matches[0] = false;
1080 return false;
1081 }
1082 optab_op2_mode = insn_data[icode].operand[2].mode;
1083 if (!VECTOR_MODE_P (optab_op2_mode))
1084 {
1085 need_same_oprnds = true;
1086 first_op1 = gimple_assign_rhs2 (stmt);
1087 }
1088 }
1089 }
1090 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1091 {
1092 need_same_oprnds = true;
1093 first_op1 = gimple_assign_rhs2 (stmt);
1094 }
1095 else if (!load_p
1096 && rhs_code == BIT_FIELD_REF)
1097 {
1098 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1099 if (TREE_CODE (vec) != SSA_NAME
1100 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1101 {
1102 if (is_a <bb_vec_info> (vinfo) && i != 0)
1103 continue;
1104 if (dump_enabled_p ())
1105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1106 "Build SLP failed: "
1107 "BIT_FIELD_REF not supported\n");
1108 /* Fatal mismatch. */
1109 matches[0] = false;
1110 return false;
1111 }
1112 }
1113 else if (call_stmt
1114 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1115 {
1116 need_same_oprnds = true;
1117 first_op1 = gimple_call_arg (call_stmt, 1);
1118 }
1119 }
1120 else
1121 {
1122 if (first_stmt_code != rhs_code
1123 && alt_stmt_code == ERROR_MARK)
1124 alt_stmt_code = rhs_code;
1125 if ((first_stmt_code != rhs_code
1126 && (first_stmt_code != IMAGPART_EXPR
1127 || rhs_code != REALPART_EXPR)
1128 && (first_stmt_code != REALPART_EXPR
1129 || rhs_code != IMAGPART_EXPR)
1130 /* Handle mismatches in plus/minus by computing both
1131 and merging the results. */
1132 && !((first_stmt_code == PLUS_EXPR
1133 || first_stmt_code == MINUS_EXPR)
1134 && (alt_stmt_code == PLUS_EXPR
1135 || alt_stmt_code == MINUS_EXPR)
1136 && rhs_code == alt_stmt_code)
1137 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1138 && (first_stmt_code == ARRAY_REF
1139 || first_stmt_code == BIT_FIELD_REF
1140 || first_stmt_code == INDIRECT_REF
1141 || first_stmt_code == COMPONENT_REF
1142 || first_stmt_code == MEM_REF)))
1143 || first_stmt_load_p != load_p
1144 || first_stmt_phi_p != phi_p)
1145 {
1146 if (dump_enabled_p ())
1147 {
1148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1149 "Build SLP failed: different operation "
1150 "in stmt %G", stmt);
1151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152 "original stmt %G", first_stmt_info->stmt);
1153 }
1154 /* Mismatch. */
1155 continue;
1156 }
1157
1158 if (need_same_oprnds)
1159 {
1160 tree other_op1 = (call_stmt
1161 ? gimple_call_arg (call_stmt, 1)
1162 : gimple_assign_rhs2 (stmt));
1163 if (!operand_equal_p (first_op1, other_op1, 0))
1164 {
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "Build SLP failed: different shift "
1168 "arguments in %G", stmt);
1169 /* Mismatch. */
1170 continue;
1171 }
1172 }
1173 if (!load_p
1174 && first_stmt_code == BIT_FIELD_REF
1175 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1176 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1177 {
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180 "Build SLP failed: different BIT_FIELD_REF "
1181 "arguments in %G", stmt);
1182 /* Mismatch. */
1183 continue;
1184 }
1185
1186 if (!load_p && rhs_code == CALL_EXPR)
1187 {
1188 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1189 as_a <gcall *> (stmt)))
1190 {
1191 if (dump_enabled_p ())
1192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193 "Build SLP failed: different calls in %G",
1194 stmt);
1195 /* Mismatch. */
1196 continue;
1197 }
1198 }
1199
1200 if (phi_p
1201 && (gimple_bb (first_stmt_info->stmt)
1202 != gimple_bb (stmt_info->stmt)))
1203 {
1204 if (dump_enabled_p ())
1205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206 "Build SLP failed: different BB for PHI "
1207 "in %G", stmt);
1208 /* Mismatch. */
1209 continue;
1210 }
1211
1212 if (!types_compatible_p (vectype, *node_vectype))
1213 {
1214 if (dump_enabled_p ())
1215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216 "Build SLP failed: different vector type "
1217 "in %G", stmt);
1218 /* Mismatch. */
1219 continue;
1220 }
1221 }
1222
1223 /* Grouped store or load. */
1224 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1225 {
1226 if (REFERENCE_CLASS_P (lhs))
1227 {
1228 /* Store. */
1229 ;
1230 }
1231 else
1232 {
1233 /* Load. */
1234 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1235 if (prev_first_load)
1236 {
1237 /* Check that there are no loads from different interleaving
1238 chains in the same node. */
1239 if (prev_first_load != first_load)
1240 {
1241 if (dump_enabled_p ())
1242 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1243 vect_location,
1244 "Build SLP failed: different "
1245 "interleaving chains in one node %G",
1246 stmt);
1247 /* Mismatch. */
1248 continue;
1249 }
1250 }
1251 else
1252 prev_first_load = first_load;
1253 }
1254 } /* Grouped access. */
1255 else
1256 {
1257 if (load_p)
1258 {
1259 /* Not grouped load. */
1260 if (dump_enabled_p ())
1261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1262 "Build SLP failed: not grouped load %G", stmt);
1263
1264 /* FORNOW: Non-grouped loads are not supported. */
1265 if (is_a <bb_vec_info> (vinfo) && i != 0)
1266 continue;
1267 /* Fatal mismatch. */
1268 matches[0] = false;
1269 return false;
1270 }
1271
1272 /* Not memory operation. */
1273 if (!phi_p
1274 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1275 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1276 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1277 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1278 && rhs_code != VIEW_CONVERT_EXPR
1279 && rhs_code != CALL_EXPR
1280 && rhs_code != BIT_FIELD_REF)
1281 {
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284 "Build SLP failed: operation unsupported %G",
1285 stmt);
1286 if (is_a <bb_vec_info> (vinfo) && i != 0)
1287 continue;
1288 /* Fatal mismatch. */
1289 matches[0] = false;
1290 return false;
1291 }
1292
1293 if (rhs_code == COND_EXPR)
1294 {
1295 tree cond_expr = gimple_assign_rhs1 (stmt);
1296 enum tree_code cond_code = TREE_CODE (cond_expr);
1297 enum tree_code swap_code = ERROR_MARK;
1298 enum tree_code invert_code = ERROR_MARK;
1299
1300 if (i == 0)
1301 first_cond_code = TREE_CODE (cond_expr);
1302 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1303 {
1304 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1305 swap_code = swap_tree_comparison (cond_code);
1306 invert_code = invert_tree_comparison (cond_code, honor_nans);
1307 }
1308
1309 if (first_cond_code == cond_code)
1310 ;
1311 /* Isomorphism can be achieved by swapping. */
1312 else if (first_cond_code == swap_code)
1313 swap[i] = 1;
1314 /* Isomorphism can be achieved by inverting. */
1315 else if (first_cond_code == invert_code)
1316 swap[i] = 2;
1317 else
1318 {
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321 "Build SLP failed: different"
1322 " operation %G", stmt);
1323 /* Mismatch. */
1324 continue;
1325 }
1326 }
1327 }
1328
1329 matches[i] = true;
1330 }
1331
1332 for (i = 0; i < group_size; ++i)
1333 if (!matches[i])
1334 return false;
1335
1336 /* If we allowed a two-operation SLP node verify the target can cope
1337 with the permute we are going to use. */
1338 if (alt_stmt_code != ERROR_MARK
1339 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1340 {
1341 *two_operators = true;
1342 }
1343
1344 return true;
1345 }
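/* Illustrative example, not part of the original source: the
   TWO_OPERATORS case above is what accepts a group such as

     c[0] = a[0] + b[0];
     c[1] = a[1] - b[1];
     c[2] = a[2] + b[2];
     c[3] = a[3] - b[3];

   where the PLUS_EXPR lanes match FIRST_STMT_CODE and the MINUS_EXPR
   lanes match ALT_STMT_CODE.  vect_build_slp_tree_2 below then builds
   one node per operation and blends their lanes with a VEC_PERM_EXPR,
   the same shape that mixed plus/minus groups arising from complex
   arithmetic typically lower to.  */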
1346
1347 /* Traits for the hash_map used to record attempted SLP builds for a stmt set.
1348 Note we never remove apart from at destruction time so we do not
1349 need a special value for deleted that differs from empty. */
1350 struct bst_traits
1351 {
1352 typedef vec <stmt_vec_info> value_type;
1353 typedef vec <stmt_vec_info> compare_type;
1354 static inline hashval_t hash (value_type);
1355 static inline bool equal (value_type existing, value_type candidate);
1356 static inline bool is_empty (value_type x) { return !x.exists (); }
1357 static inline bool is_deleted (value_type x) { return !x.exists (); }
1358 static const bool empty_zero_p = true;
1359 static inline void mark_empty (value_type &x) { x.release (); }
1360 static inline void mark_deleted (value_type &x) { x.release (); }
1361 static inline void remove (value_type &x) { x.release (); }
1362 };
1363 inline hashval_t
1364 bst_traits::hash (value_type x)
1365 {
1366 inchash::hash h;
1367 for (unsigned i = 0; i < x.length (); ++i)
1368 h.add_int (gimple_uid (x[i]->stmt));
1369 return h.end ();
1370 }
1371 inline bool
1372 bst_traits::equal (value_type existing, value_type candidate)
1373 {
1374 if (existing.length () != candidate.length ())
1375 return false;
1376 for (unsigned i = 0; i < existing.length (); ++i)
1377 if (existing[i] != candidate[i])
1378 return false;
1379 return true;
1380 }
1381
1382 typedef hash_map <vec <gimple *>, slp_tree,
1383 simple_hashmap_traits <bst_traits, slp_tree> >
1384 scalar_stmts_to_slp_tree_map_t;
1385
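/* Illustrative sketch, not part of the original source: the map type
   above memoizes SLP discovery per set of scalar stmts, so a shared
   subgraph is only discovered once:

     if (slp_tree *leader = bst_map->get (stmts))
       reuse *leader, or fail again cheaply if the cached entry is NULL
     else
       build the node and record it with bst_map->put (stmts.copy (), node)

   vect_build_slp_tree below implements exactly this lookup/insert
   protocol around vect_build_slp_tree_2.  */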
1386 static slp_tree
1387 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1388 vec<stmt_vec_info> stmts, unsigned int group_size,
1389 poly_uint64 *max_nunits,
1390 bool *matches, unsigned *limit, unsigned *tree_size,
1391 scalar_stmts_to_slp_tree_map_t *bst_map);
1392
1393 static slp_tree
1394 vect_build_slp_tree (vec_info *vinfo,
1395 vec<stmt_vec_info> stmts, unsigned int group_size,
1396 poly_uint64 *max_nunits,
1397 bool *matches, unsigned *limit, unsigned *tree_size,
1398 scalar_stmts_to_slp_tree_map_t *bst_map)
1399 {
1400 if (slp_tree *leader = bst_map->get (stmts))
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1404 *leader ? "" : "failed ", *leader);
1405 if (*leader)
1406 {
1407 SLP_TREE_REF_COUNT (*leader)++;
1408 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1409 }
1410 return *leader;
1411 }
1412
1413 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1414 so we can pick up backedge destinations during discovery. */
1415 slp_tree res = new _slp_tree;
1416 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1417 SLP_TREE_SCALAR_STMTS (res) = stmts;
1418 bst_map->put (stmts.copy (), res);
1419
1420 if (*limit == 0)
1421 {
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "SLP discovery limit exceeded\n");
1425 bool existed_p = bst_map->put (stmts, NULL);
1426 gcc_assert (existed_p);
1427 /* Mark the node invalid so we can detect those when still in use
1428 as backedge destinations. */
1429 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1430 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1431 vect_free_slp_tree (res);
1432 return NULL;
1433 }
1434 --*limit;
1435
1436 poly_uint64 this_max_nunits = 1;
1437 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1438 &this_max_nunits,
1439 matches, limit, tree_size, bst_map);
1440 if (!res_)
1441 {
1442 bool existed_p = bst_map->put (stmts, NULL);
1443 gcc_assert (existed_p);
1444 /* Mark the node invalid so we can detect those when still in use
1445 as backedge destinations. */
1446 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1447 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1448 vect_free_slp_tree (res);
1449 }
1450 else
1451 {
1452 gcc_assert (res_ == res);
1453 res->max_nunits = this_max_nunits;
1454 vect_update_max_nunits (max_nunits, this_max_nunits);
1455 /* Keep a reference for the bst_map use. */
1456 SLP_TREE_REF_COUNT (res)++;
1457 }
1458 return res_;
1459 }
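/* Illustrative sketch, not part of the original source: how the *LIMIT
   budget handled above bounds discovery.  Every attempt to build a new
   node decrements the counter, and once it reaches zero further calls
   record a NULL (failed) entry in BST_MAP and give up immediately:

     unsigned limit = 1000;    hypothetical discovery budget
     slp_tree root = vect_build_slp_tree (vinfo, stmts, group_size,
                                          &max_nunits, matches, &limit,
                                          &tree_size, bst_map);

   The stub node seeded into BST_MAP before recursing into
   vect_build_slp_tree_2 is what allows a backedge in a cycle to find its
   destination while that destination is still being built.  */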
1460
1461 /* Recursively build an SLP tree starting from NODE.
1462 Fail (and return NULL) if def-stmts are not
1463 isomorphic, require data permutation or are of unsupported types of
1464 operation. Otherwise, return the built SLP node. MATCHES
1465 indicates which of the group's stmts did not match
1466 the first one. */
1467
1468 static slp_tree
1469 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1470 vec<stmt_vec_info> stmts, unsigned int group_size,
1471 poly_uint64 *max_nunits,
1472 bool *matches, unsigned *limit, unsigned *tree_size,
1473 scalar_stmts_to_slp_tree_map_t *bst_map)
1474 {
1475 unsigned nops, i, this_tree_size = 0;
1476 poly_uint64 this_max_nunits = *max_nunits;
1477
1478 matches[0] = false;
1479
1480 stmt_vec_info stmt_info = stmts[0];
1481 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1482 nops = gimple_call_num_args (stmt);
1483 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1484 {
1485 nops = gimple_num_ops (stmt) - 1;
1486 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1487 nops++;
1488 }
1489 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1490 nops = gimple_phi_num_args (phi);
1491 else
1492 return NULL;
1493
1494 /* If the SLP node is a PHI (induction or reduction), terminate
1495 the recursion. */
1496 bool *skip_args = XALLOCAVEC (bool, nops);
1497 memset (skip_args, 0, sizeof (bool) * nops);
1498 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1499 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1500 {
1501 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1502 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1503 group_size);
1504 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1505 max_nunits))
1506 return NULL;
1507
1508 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1509 if (def_type == vect_induction_def)
1510 {
1511 /* Induction PHIs are not cycles but walk the initial
1512 value. Only for inner loops though; for outer loops
1513 we need to pick up the value from the actual PHIs
1514 to more easily support peeling and epilogue vectorization. */
1515 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1516 if (!nested_in_vect_loop_p (loop, stmt_info))
1517 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1518 else
1519 loop = loop->inner;
1520 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1521 }
1522 else if (def_type == vect_reduction_def
1523 || def_type == vect_double_reduction_def
1524 || def_type == vect_nested_cycle)
1525 {
1526 /* Else def types have to match. */
1527 stmt_vec_info other_info;
1528 bool all_same = true;
1529 FOR_EACH_VEC_ELT (stmts, i, other_info)
1530 {
1531 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1532 return NULL;
1533 if (other_info != stmt_info)
1534 all_same = false;
1535 }
1536 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1537 /* Reduction initial values are not explicitly represented. */
1538 if (!nested_in_vect_loop_p (loop, stmt_info))
1539 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1540 /* Reduction chain backedge defs are filled manually.
1541 ??? Need a better way to identify a SLP reduction chain PHI.
1542 Or a better overall way to SLP match those. */
1543 if (all_same && def_type == vect_reduction_def)
1544 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1545 }
1546 else if (def_type != vect_internal_def)
1547 return NULL;
1548 }
1549
1550
1551 bool two_operators = false;
1552 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1553 tree vectype = NULL_TREE;
1554 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1555 &this_max_nunits, matches, &two_operators,
1556 &vectype))
1557 return NULL;
1558
1559 /* If the SLP node is a load, terminate the recursion unless masked. */
1560 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1561 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1562 {
1563 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1564 {
1565 /* Masked load. */
1566 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1567 nops = 1;
1568 }
1569 else
1570 {
1571 *max_nunits = this_max_nunits;
1572 (*tree_size)++;
1573 node = vect_create_new_slp_node (node, stmts, 0);
1574 SLP_TREE_VECTYPE (node) = vectype;
1575 /* And compute the load permutation. Whether it is actually
1576 a permutation depends on the unrolling factor which is
1577 decided later. */
1578 vec<unsigned> load_permutation;
1579 int j;
1580 stmt_vec_info load_info;
1581 load_permutation.create (group_size);
1582 stmt_vec_info first_stmt_info
1583 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1584 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1585 {
1586 int load_place = vect_get_place_in_interleaving_chain
1587 (load_info, first_stmt_info);
1588 gcc_assert (load_place != -1);
1589 load_permutation.safe_push (load_place);
1590 }
1591 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1592 return node;
1593 }
1594 }
1595 else if (gimple_assign_single_p (stmt_info->stmt)
1596 && !gimple_vuse (stmt_info->stmt)
1597 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1598 {
1599 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1600 the same SSA name vector of a type compatible with vectype. */
1601 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1602 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1603 stmt_vec_info estmt_info;
1604 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1605 {
1606 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1607 tree bfref = gimple_assign_rhs1 (estmt);
1608 HOST_WIDE_INT lane;
1609 if (!known_eq (bit_field_size (bfref),
1610 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1611 || !constant_multiple_p (bit_field_offset (bfref),
1612 bit_field_size (bfref), &lane))
1613 {
1614 lperm.release ();
1615 return NULL;
1616 }
1617 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1618 }
1619 slp_tree vnode = vect_create_new_slp_node (vNULL);
1620 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1621 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1622 /* We are always building a permutation node even if it is an identity
1623 permute to shield the rest of the vectorizer from the odd node
1624 representing an actual vector without any scalar ops.
1625 ??? We could hide it completely by making the permute node
1626 external? */
1627 node = vect_create_new_slp_node (node, stmts, 1);
1628 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1629 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1630 SLP_TREE_VECTYPE (node) = vectype;
1631 SLP_TREE_CHILDREN (node).quick_push (vnode);
1632 return node;
1633 }
1634
1635 /* Get at the operands, verifying they are compatible. */
1636 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1637 slp_oprnd_info oprnd_info;
1638 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1639 {
1640 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1641 stmts, i, &oprnds_info);
1642 if (res != 0)
1643 matches[(res == -1) ? 0 : i] = false;
1644 if (!matches[0])
1645 break;
1646 }
1647 for (i = 0; i < group_size; ++i)
1648 if (!matches[i])
1649 {
1650 vect_free_oprnd_info (oprnds_info);
1651 return NULL;
1652 }
1653 swap = NULL;
1654
1655 auto_vec<slp_tree, 4> children;
1656
1657 stmt_info = stmts[0];
1658
1659 /* Create SLP_TREE nodes for the definition node/s. */
1660 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1661 {
1662 slp_tree child;
1663 unsigned int j;
1664
1665 /* We're skipping certain operands from processing, for example
1666 outer loop reduction initial defs. */
1667 if (skip_args[i])
1668 {
1669 children.safe_push (NULL);
1670 continue;
1671 }
1672
1673 if (oprnd_info->first_dt == vect_uninitialized_def)
1674 {
1675 /* COND_EXPR has one operand slot too many here when the condition
1676 is an SSA name. */
1677 gcc_assert (i == 3 && nops == 4);
1678 continue;
1679 }
1680
1681 if (is_a <bb_vec_info> (vinfo)
1682 && oprnd_info->first_dt == vect_internal_def
1683 && !oprnd_info->any_pattern)
1684 {
1685 /* For BB vectorization, if all defs are the same do not
1686 bother to continue the build along the single-lane
1687 graph but use a splat of the scalar value. */
1688 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1689 for (j = 1; j < group_size; ++j)
1690 if (oprnd_info->def_stmts[j] != first_def)
1691 break;
1692 if (j == group_size
1693 /* But avoid doing this for loads where we may be
1694 able to CSE things, unless the stmt is not
1695 vectorizable. */
1696 && (!STMT_VINFO_VECTORIZABLE (first_def)
1697 || !gimple_vuse (first_def->stmt)))
1698 {
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_NOTE, vect_location,
1701 "Using a splat of the uniform operand\n");
1702 oprnd_info->first_dt = vect_external_def;
1703 }
1704 }
1705
1706 if (oprnd_info->first_dt == vect_external_def
1707 || oprnd_info->first_dt == vect_constant_def)
1708 {
1709 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1710 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1711 oprnd_info->ops = vNULL;
1712 children.safe_push (invnode);
1713 continue;
1714 }
1715
1716 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1717 group_size, &this_max_nunits,
1718 matches, limit,
1719 &this_tree_size, bst_map)) != NULL)
1720 {
1721 oprnd_info->def_stmts = vNULL;
1722 children.safe_push (child);
1723 continue;
1724 }
1725
1726 /* If the SLP build for operand zero failed and operand zero
1727 and one can be commuted, try that for the scalar stmts
1728 that failed the match. */
1729 if (i == 0
1730 /* A first scalar stmt mismatch signals a fatal mismatch. */
1731 && matches[0]
1732 /* ??? For COND_EXPRs we can swap the comparison operands
1733 as well as the arms under some constraints. */
1734 && nops == 2
1735 && oprnds_info[1]->first_dt == vect_internal_def
1736 && is_gimple_assign (stmt_info->stmt)
1737 /* Swapping operands for reductions breaks assumptions later on. */
1738 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1739 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1740 {
1741 /* See whether we can swap the matching or the non-matching
1742 stmt operands. */
1743 bool swap_not_matching = true;
1744 do
1745 {
1746 for (j = 0; j < group_size; ++j)
1747 {
1748 if (matches[j] != !swap_not_matching)
1749 continue;
1750 stmt_vec_info stmt_info = stmts[j];
1751 /* Verify if we can swap operands of this stmt. */
1752 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1753 if (!stmt
1754 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1755 {
1756 if (!swap_not_matching)
1757 goto fail;
1758 swap_not_matching = false;
1759 break;
1760 }
1761 }
1762 }
1763 while (j != group_size);
1764
1765 /* Swap mismatched definition stmts. */
1766 if (dump_enabled_p ())
1767 dump_printf_loc (MSG_NOTE, vect_location,
1768 "Re-trying with swapped operands of stmts ");
1769 for (j = 0; j < group_size; ++j)
1770 if (matches[j] == !swap_not_matching)
1771 {
1772 std::swap (oprnds_info[0]->def_stmts[j],
1773 oprnds_info[1]->def_stmts[j]);
1774 std::swap (oprnds_info[0]->ops[j],
1775 oprnds_info[1]->ops[j]);
1776 if (dump_enabled_p ())
1777 dump_printf (MSG_NOTE, "%d ", j);
1778 }
1779 if (dump_enabled_p ())
1780 dump_printf (MSG_NOTE, "\n");
1781 /* And try again with scratch 'matches' ... */
1782 bool *tem = XALLOCAVEC (bool, group_size);
1783 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1784 group_size, &this_max_nunits,
1785 tem, limit,
1786 &this_tree_size, bst_map)) != NULL)
1787 {
1788 oprnd_info->def_stmts = vNULL;
1789 children.safe_push (child);
1790 continue;
1791 }
1792 }
1793 fail:
1794
1795 /* If the SLP build failed and we analyze a basic-block
1796 simply treat nodes we fail to build as externally defined
1797 (and thus build vectors from the scalar defs).
1798 The cost model will reject outright expensive cases.
1799 ??? This doesn't treat cases where permutation ultimately
1800 fails (or we don't try permutation below). Ideally we'd
1801 even compute a permutation that will end up with the maximum
1802 SLP tree size... */
1803 if (is_a <bb_vec_info> (vinfo)
1804 /* ??? Rejecting patterns this way doesn't work. We'd have to
1805 do extra work to cancel the pattern so the uses see the
1806 scalar version. */
1807 && !is_pattern_stmt_p (stmt_info)
1808 && !oprnd_info->any_pattern)
1809 {
1810 /* But if there's a leading vector-sized set of matching stmts,
1811 fail here so we can split the group. This matches the condition
1812 vect_analyze_slp_instance uses. */
1813 /* ??? We might want to split here and combine the results to support
1814 multiple vector sizes better. */
1815 for (j = 0; j < group_size; ++j)
1816 if (!matches[j])
1817 break;
1818 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1819 {
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_NOTE, vect_location,
1822 "Building vector operands from scalars\n");
1823 this_tree_size++;
1824 child = vect_create_new_slp_node (oprnd_info->ops);
1825 children.safe_push (child);
1826 oprnd_info->ops = vNULL;
1827 continue;
1828 }
1829 }
1830
1831 gcc_assert (child == NULL);
1832 FOR_EACH_VEC_ELT (children, j, child)
1833 if (child)
1834 vect_free_slp_tree (child);
1835 vect_free_oprnd_info (oprnds_info);
1836 return NULL;
1837 }
1838
1839 vect_free_oprnd_info (oprnds_info);
1840
1841 /* If all children of this node are built up from uniform scalars,
1842 or if it requires more than one possibly expensive vector
1843 construction, throw the node away so it is built up from scalars instead.
1844 The exception is the SLP node for the vector store. */
1845 if (is_a <bb_vec_info> (vinfo)
1846 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1847 /* ??? Rejecting patterns this way doesn't work. We'd have to
1848 do extra work to cancel the pattern so the uses see the
1849 scalar version. */
1850 && !is_pattern_stmt_p (stmt_info))
1851 {
1852 slp_tree child;
1853 unsigned j;
1854 bool all_uniform_p = true;
1855 unsigned n_vector_builds = 0;
1856 FOR_EACH_VEC_ELT (children, j, child)
1857 {
1858 if (!child)
1859 ;
1860 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1861 all_uniform_p = false;
1862 else if (!vect_slp_tree_uniform_p (child))
1863 {
1864 all_uniform_p = false;
1865 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1866 n_vector_builds++;
1867 }
1868 }
1869 if (all_uniform_p || n_vector_builds > 1)
1870 {
1871 /* Roll back. */
1872 matches[0] = false;
1873 FOR_EACH_VEC_ELT (children, j, child)
1874 if (child)
1875 vect_free_slp_tree (child);
1876
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "Building parent vector operands from "
1880 "scalars instead\n");
1881 return NULL;
1882 }
1883 }
1884
1885 *tree_size += this_tree_size + 1;
1886 *max_nunits = this_max_nunits;
1887
1888 if (two_operators)
1889 {
1890 /* ??? We'd likely want to either cache in bst_map sth like
1891 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1892 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1893 explicit stmts to put in so the keying on 'stmts' doesn't
1894 work (but we have the same issue with nodes that use 'ops'). */
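/* For example, for the group { a+b, a-b, a+b, a-b } node ONE gets code
   PLUS_EXPR, node TWO gets MINUS_EXPR and the lane permutation built
   below becomes { (0,0), (1,1), (0,2), (1,3) }, selecting each lane
   from the node whose operation matches that scalar stmt.  */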
1895 slp_tree one = new _slp_tree;
1896 slp_tree two = new _slp_tree;
1897 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1898 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1899 SLP_TREE_VECTYPE (one) = vectype;
1900 SLP_TREE_VECTYPE (two) = vectype;
1901 SLP_TREE_CHILDREN (one).safe_splice (children);
1902 SLP_TREE_CHILDREN (two).safe_splice (children);
1903 slp_tree child;
1904 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1905 SLP_TREE_REF_COUNT (child)++;
1906
1907 /* Here we record the original defs since this
1908 node represents the final lane configuration. */
1909 node = vect_create_new_slp_node (node, stmts, 2);
1910 SLP_TREE_VECTYPE (node) = vectype;
1911 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1912 SLP_TREE_CHILDREN (node).quick_push (one);
1913 SLP_TREE_CHILDREN (node).quick_push (two);
1914 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1915 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1916 enum tree_code ocode = ERROR_MARK;
1917 stmt_vec_info ostmt_info;
1918 unsigned j = 0;
1919 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1920 {
1921 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1922 if (gimple_assign_rhs_code (ostmt) != code0)
1923 {
1924 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1925 ocode = gimple_assign_rhs_code (ostmt);
1926 j = i;
1927 }
1928 else
1929 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1930 }
1931 SLP_TREE_CODE (one) = code0;
1932 SLP_TREE_CODE (two) = ocode;
1933 SLP_TREE_LANES (one) = stmts.length ();
1934 SLP_TREE_LANES (two) = stmts.length ();
1935 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1936 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1937 return node;
1938 }
1939
1940 node = vect_create_new_slp_node (node, stmts, nops);
1941 SLP_TREE_VECTYPE (node) = vectype;
1942 SLP_TREE_CHILDREN (node).splice (children);
1943 return node;
1944 }
1945
1946 /* Dump a single SLP tree NODE. */
1947
1948 static void
1949 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1950 slp_tree node)
1951 {
1952 unsigned i, j;
1953 slp_tree child;
1954 stmt_vec_info stmt_info;
1955 tree op;
1956
1957 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1958 dump_user_location_t user_loc = loc.get_user_location ();
1959 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1960 SLP_TREE_DEF_TYPE (node) == vect_external_def
1961 ? " (external)"
1962 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1963 ? " (constant)"
1964 : ""), node,
1965 estimated_poly_value (node->max_nunits),
1966 SLP_TREE_REF_COUNT (node));
1967 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
1968 {
1969 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
1970 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
1971 else
1972 dump_printf_loc (metadata, user_loc, "op template: %G",
1973 SLP_TREE_REPRESENTATIVE (node)->stmt);
1974 }
1975 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1976 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1977 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1978 else
1979 {
1980 dump_printf_loc (metadata, user_loc, "\t{ ");
1981 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1982 dump_printf (metadata, "%T%s ", op,
1983 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1984 dump_printf (metadata, "}\n");
1985 }
1986 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1987 {
1988 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1989 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1990 dump_printf (dump_kind, " %u", j);
1991 dump_printf (dump_kind, " }\n");
1992 }
1993 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1994 {
1995 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1996 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1997 dump_printf (dump_kind, " %u[%u]",
1998 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1999 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2000 dump_printf (dump_kind, " }\n");
2001 }
2002 if (SLP_TREE_CHILDREN (node).is_empty ())
2003 return;
2004 dump_printf_loc (metadata, user_loc, "\tchildren");
2005 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2006 dump_printf (dump_kind, " %p", (void *)child);
2007 dump_printf (dump_kind, "\n");
2008 }
2009
2010 DEBUG_FUNCTION void
2011 debug (slp_tree node)
2012 {
2013 debug_dump_context ctx;
2014 vect_print_slp_tree (MSG_NOTE,
2015 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2016 node);
2017 }
2018
2019 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2020
2021 static void
2022 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2023 slp_tree node, hash_set<slp_tree> &visited)
2024 {
2025 unsigned i;
2026 slp_tree child;
2027
2028 if (visited.add (node))
2029 return;
2030
2031 vect_print_slp_tree (dump_kind, loc, node);
2032
2033 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2034 if (child)
2035 vect_print_slp_graph (dump_kind, loc, child, visited);
2036 }
2037
2038 static void
2039 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2040 slp_tree entry)
2041 {
2042 hash_set<slp_tree> visited;
2043 vect_print_slp_graph (dump_kind, loc, entry, visited);
2044 }
2045
2046 /* Mark the tree rooted at NODE with PURE_SLP. */
2047
2048 static void
2049 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2050 {
2051 int i;
2052 stmt_vec_info stmt_info;
2053 slp_tree child;
2054
2055 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2056 return;
2057
2058 if (visited.add (node))
2059 return;
2060
2061 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2062 STMT_SLP_TYPE (stmt_info) = pure_slp;
2063
2064 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2065 if (child)
2066 vect_mark_slp_stmts (child, visited);
2067 }
2068
2069 static void
2070 vect_mark_slp_stmts (slp_tree node)
2071 {
2072 hash_set<slp_tree> visited;
2073 vect_mark_slp_stmts (node, visited);
2074 }
2075
2076 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2077
2078 static void
2079 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2080 {
2081 int i;
2082 stmt_vec_info stmt_info;
2083 slp_tree child;
2084
2085 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2086 return;
2087
2088 if (visited.add (node))
2089 return;
2090
2091 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2092 {
2093 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2094 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2095 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2096 }
2097
2098 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2099 if (child)
2100 vect_mark_slp_stmts_relevant (child, visited);
2101 }
2102
2103 static void
2104 vect_mark_slp_stmts_relevant (slp_tree node)
2105 {
2106 hash_set<slp_tree> visited;
2107 vect_mark_slp_stmts_relevant (node, visited);
2108 }
2109
2110
2111 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2112
2113 static void
2114 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2115 hash_set<slp_tree> &visited)
2116 {
2117 if (!node || visited.add (node))
2118 return;
2119
2120 if (SLP_TREE_CHILDREN (node).length () == 0)
2121 {
2122 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2123 return;
2124 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2125 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2126 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2127 loads.safe_push (node);
2128 }
2129 else
2130 {
2131 unsigned i;
2132 slp_tree child;
2133 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2134 vect_gather_slp_loads (loads, child, visited);
2135 }
2136 }
2137
2138
2139 /* Find the last scalar stmt in NODE. */
2140
2141 stmt_vec_info
2142 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2143 {
2144 stmt_vec_info last = NULL;
2145 stmt_vec_info stmt_vinfo;
2146
2147 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2148 {
2149 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2150 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2151 }
2152
2153 return last;
2154 }
2155
2156 /* Find the first scalar stmt in NODE. */
2157
2158 stmt_vec_info
2159 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2160 {
2161 stmt_vec_info first = NULL;
2162 stmt_vec_info stmt_vinfo;
2163
2164 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2165 {
2166 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2167 if (!first
2168 || get_later_stmt (stmt_vinfo, first) == first)
2169 first = stmt_vinfo;
2170 }
2171
2172 return first;
2173 }
2174
2175 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2176 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2177 (also containing the first GROUP1_SIZE stmts, since stores are
2178 consecutive), the second containing the remainder.
2179 Return the first stmt in the second group. */
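   For example, splitting a group of size 7 with GROUP1_SIZE 4 yields
   groups of sizes 4 and 3; the second group's gap becomes the original
   gap plus 4 and the first group's gap grows by 3.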
2180
2181 static stmt_vec_info
2182 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2183 {
2184 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2185 gcc_assert (group1_size > 0);
2186 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2187 gcc_assert (group2_size > 0);
2188 DR_GROUP_SIZE (first_vinfo) = group1_size;
2189
2190 stmt_vec_info stmt_info = first_vinfo;
2191 for (unsigned i = group1_size; i > 1; i--)
2192 {
2193 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2194 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2195 }
2196 /* STMT is now the last element of the first group. */
2197 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2198 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2199
2200 DR_GROUP_SIZE (group2) = group2_size;
2201 for (stmt_info = group2; stmt_info;
2202 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2203 {
2204 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2205 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2206 }
2207
2208 /* For the second group, the DR_GROUP_GAP is that before the original group,
2209 plus skipping over the first vector. */
2210 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2211
2212 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2213 DR_GROUP_GAP (first_vinfo) += group2_size;
2214
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2217 group1_size, group2_size);
2218
2219 return group2;
2220 }
2221
2222 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2223 statements and a vector of NUNITS elements. */
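/* For example, NUNITS = 4 (say V4SI) and GROUP_SIZE = 6 give
   lcm (4, 6) / 6 = 12 / 6 = 2, i.e. two copies of the group are
   needed to fill whole vectors.  */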
2224
2225 static poly_uint64
2226 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2227 {
2228 return exact_div (common_multiple (nunits, group_size), group_size);
2229 }
2230
2231 /* Helper function of vect_match_slp_patterns.
2232
2233 Attempts to match patterns against the slp tree rooted in REF_NODE using
2234 VINFO. Patterns are matched in post-order traversal.
2235
2236 If any pattern matches, the node referenced by REF_NODE is updated in
2237 place and true is returned, otherwise false is returned.
2238
2239 static bool
2240 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2241 slp_tree_to_load_perm_map_t *perm_cache,
2242 hash_set<slp_tree> *visited)
2243 {
2244 unsigned i;
2245 slp_tree node = *ref_node;
2246 bool found_p = false;
2247 if (!node || visited->add (node))
2248 return false;
2249
2250 slp_tree child;
2251 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2252 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2253 vinfo, perm_cache, visited);
2254
2255 for (unsigned x = 0; x < num__slp_patterns; x++)
2256 {
2257 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2258 if (pattern)
2259 {
2260 pattern->build (vinfo);
2261 delete pattern;
2262 found_p = true;
2263 }
2264 }
2265
2266 return found_p;
2267 }
2268
2269 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2270 vec_info VINFO.
2271
2272 Return true if any pattern matched; the instance's tree is modified in
2273 place. Patterns are tried in order and multiple patterns may match.
2274
2275 static bool
2276 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2277 hash_set<slp_tree> *visited,
2278 slp_tree_to_load_perm_map_t *perm_cache,
2279 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
2280 {
2281 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2282 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2283
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "Analyzing SLP tree %p for patterns\n",
2287 SLP_INSTANCE_TREE (instance));
2288
2289 bool found_p
2290 = vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2291
2292 if (found_p)
2293 {
2294 if (dump_enabled_p ())
2295 {
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "Pattern matched SLP tree\n");
2298 vect_print_slp_graph (MSG_NOTE, vect_location, *ref_node);
2299 }
2300 }
2301
2302 return found_p;
2303 }
2304
2305 /* Analyze an SLP instance starting from a group of grouped stores. Call
2306 vect_build_slp_tree to build a tree of packed stmts if possible.
2307 Return FALSE if it's impossible to SLP any stmt in the loop. */
2308
2309 static bool
2310 vect_analyze_slp_instance (vec_info *vinfo,
2311 scalar_stmts_to_slp_tree_map_t *bst_map,
2312 stmt_vec_info stmt_info, slp_instance_kind kind,
2313 unsigned max_tree_size, unsigned *limit);
2314
2315 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2316 of KIND. Return true if successful. */
2317
2318 static bool
2319 vect_build_slp_instance (vec_info *vinfo,
2320 slp_instance_kind kind,
2321 vec<stmt_vec_info> &scalar_stmts,
2322 stmt_vec_info root_stmt_info,
2323 unsigned max_tree_size, unsigned *limit,
2324 scalar_stmts_to_slp_tree_map_t *bst_map,
2325 /* ??? We need stmt_info for group splitting. */
2326 stmt_vec_info stmt_info_)
2327 {
2328 if (dump_enabled_p ())
2329 {
2330 dump_printf_loc (MSG_NOTE, vect_location,
2331 "Starting SLP discovery for\n");
2332 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 " %G", scalar_stmts[i]->stmt);
2335 }
2336
2337 /* Build the tree for the SLP instance. */
2338 unsigned int group_size = scalar_stmts.length ();
2339 bool *matches = XALLOCAVEC (bool, group_size);
2340 poly_uint64 max_nunits = 1;
2341 unsigned tree_size = 0;
2342 unsigned i;
2343 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2344 &max_nunits, matches, limit,
2345 &tree_size, bst_map);
2346 if (node != NULL)
2347 {
2348 /* Calculate the unrolling factor based on the smallest type. */
2349 poly_uint64 unrolling_factor
2350 = calculate_unrolling_factor (max_nunits, group_size);
2351
2352 if (maybe_ne (unrolling_factor, 1U)
2353 && is_a <bb_vec_info> (vinfo))
2354 {
2355 unsigned HOST_WIDE_INT const_max_nunits;
2356 if (!max_nunits.is_constant (&const_max_nunits)
2357 || const_max_nunits > group_size)
2358 {
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361 "Build SLP failed: store group "
2362 "size not a multiple of the vector size "
2363 "in basic block SLP\n");
2364 vect_free_slp_tree (node);
2365 return false;
2366 }
2367 /* Fatal mismatch. */
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE, vect_location,
2370 "SLP discovery succeeded but node needs "
2371 "splitting\n");
2372 memset (matches, true, group_size);
2373 matches[group_size / const_max_nunits * const_max_nunits] = false;
2374 vect_free_slp_tree (node);
2375 }
2376 else
2377 {
2378 /* Create a new SLP instance. */
2379 slp_instance new_instance = XNEW (class _slp_instance);
2380 SLP_INSTANCE_TREE (new_instance) = node;
2381 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2382 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2383 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2384 SLP_INSTANCE_KIND (new_instance) = kind;
2385 new_instance->reduc_phis = NULL;
2386 new_instance->cost_vec = vNULL;
2387 new_instance->subgraph_entries = vNULL;
2388
2389 if (dump_enabled_p ())
2390 dump_printf_loc (MSG_NOTE, vect_location,
2391 "SLP size %u vs. limit %u.\n",
2392 tree_size, max_tree_size);
2393
2394 /* Fixup SLP reduction chains. */
2395 if (kind == slp_inst_kind_reduc_chain)
2396 {
2397 /* If this is a reduction chain with a conversion in front
2398 amend the SLP tree with a node for that. */
2399 gimple *scalar_def
2400 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2401 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2402 {
2403 /* Get at the conversion stmt - we know it's the single use
2404 of the last stmt of the reduction chain. */
2405 use_operand_p use_p;
2406 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2407 &use_p, &scalar_def);
2408 gcc_assert (r);
2409 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2410 next_info = vect_stmt_to_vectorize (next_info);
2411 scalar_stmts = vNULL;
2412 scalar_stmts.create (group_size);
2413 for (unsigned i = 0; i < group_size; ++i)
2414 scalar_stmts.quick_push (next_info);
2415 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2416 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2417 SLP_TREE_CHILDREN (conv).quick_push (node);
2418 SLP_INSTANCE_TREE (new_instance) = conv;
2419 /* We also have to fake this conversion stmt as SLP reduction
2420 group so we don't have to mess with too much code
2421 elsewhere. */
2422 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2423 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2424 }
2425 /* Fill the backedge child of the PHI SLP node. The
2426 general matching code cannot find it because the
2427 scalar code does not reflect how we vectorize the
2428 reduction. */
2429 use_operand_p use_p;
2430 imm_use_iterator imm_iter;
2431 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2432 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2433 gimple_get_lhs (scalar_def))
2434 /* There are exactly two non-debug uses, the reduction
2435 PHI and the loop-closed PHI node. */
2436 if (!is_gimple_debug (USE_STMT (use_p))
2437 && gimple_bb (USE_STMT (use_p)) == loop->header)
2438 {
2439 auto_vec<stmt_vec_info, 64> phis (group_size);
2440 stmt_vec_info phi_info
2441 = vinfo->lookup_stmt (USE_STMT (use_p));
2442 for (unsigned i = 0; i < group_size; ++i)
2443 phis.quick_push (phi_info);
2444 slp_tree *phi_node = bst_map->get (phis);
2445 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2446 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2447 = SLP_INSTANCE_TREE (new_instance);
2448 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2449 }
2450 }
2451
2452 vinfo->slp_instances.safe_push (new_instance);
2453
2454 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2455 the number of scalar stmts in the root in a few places.
2456 Verify that assumption holds. */
2457 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2458 .length () == group_size);
2459
2460 if (dump_enabled_p ())
2461 {
2462 dump_printf_loc (MSG_NOTE, vect_location,
2463 "Final SLP tree for instance %p:\n", new_instance);
2464 vect_print_slp_graph (MSG_NOTE, vect_location,
2465 SLP_INSTANCE_TREE (new_instance));
2466 }
2467
2468 return true;
2469 }
2470 }
2471 else
2472 {
2473 /* Failed to SLP. */
2474 /* Free the allocated memory. */
2475 scalar_stmts.release ();
2476 }
2477
2478 stmt_vec_info stmt_info = stmt_info_;
2479 /* Try to break the group up into pieces. */
2480 if (kind == slp_inst_kind_store)
2481 {
2482 /* ??? We could delay all the actual splitting of store-groups
2483 until after SLP discovery of the original group has completed.
2484 Then we can recurse to vect_build_slp_instance directly. */
2485 for (i = 0; i < group_size; i++)
2486 if (!matches[i])
2487 break;
2488
2489 /* For basic block SLP, try to break the group up into multiples of
2490 a vector size. */
2491 if (is_a <bb_vec_info> (vinfo)
2492 && (i > 1 && i < group_size))
2493 {
2494 tree scalar_type
2495 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2496 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2497 1 << floor_log2 (i));
2498 unsigned HOST_WIDE_INT const_nunits;
2499 if (vectype
2500 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2501 {
2502 /* Split into two groups at the first vector boundary. */
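/* E.g. a mismatch at stmt I = 6 with CONST_NUNITS = 4 gives
   GROUP1_SIZE = 6 & ~3 = 4.  */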
2503 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2504 unsigned group1_size = i & ~(const_nunits - 1);
2505
2506 if (dump_enabled_p ())
2507 dump_printf_loc (MSG_NOTE, vect_location,
2508 "Splitting SLP group at stmt %u\n", i);
2509 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2510 group1_size);
2511 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2512 kind, max_tree_size,
2513 limit);
2514 /* Split the rest at the failure point and possibly
2515 re-analyze the remaining matching part if it has
2516 at least two lanes. */
2517 if (group1_size < i
2518 && (i + 1 < group_size
2519 || i - group1_size > 1))
2520 {
2521 stmt_vec_info rest2 = rest;
2522 rest = vect_split_slp_store_group (rest, i - group1_size);
2523 if (i - group1_size > 1)
2524 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2525 kind, max_tree_size,
2526 limit);
2527 }
2528 /* Re-analyze the non-matching tail if it has at least
2529 two lanes. */
2530 if (i + 1 < group_size)
2531 res |= vect_analyze_slp_instance (vinfo, bst_map,
2532 rest, kind, max_tree_size,
2533 limit);
2534 return res;
2535 }
2536 }
2537
2538 /* For loop vectorization split into arbitrary pieces of size > 1. */
2539 if (is_a <loop_vec_info> (vinfo)
2540 && (i > 1 && i < group_size))
2541 {
2542 unsigned group1_size = i;
2543
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location,
2546 "Splitting SLP group at stmt %u\n", i);
2547
2548 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2549 group1_size);
2550 /* Loop vectorization cannot handle gaps in stores, make sure
2551 the split group appears as strided. */
2552 STMT_VINFO_STRIDED_P (rest) = 1;
2553 DR_GROUP_GAP (rest) = 0;
2554 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2555 DR_GROUP_GAP (stmt_info) = 0;
2556
2557 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2558 kind, max_tree_size, limit);
2559 if (i + 1 < group_size)
2560 res |= vect_analyze_slp_instance (vinfo, bst_map,
2561 rest, kind, max_tree_size, limit);
2562
2563 return res;
2564 }
2565
2566 /* Even though the first vector did not all match, we might be able to SLP
2567 (some) of the remainder. FORNOW ignore this possibility. */
2568 }
2569
2570 /* Failed to SLP. */
2571 if (dump_enabled_p ())
2572 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2573 return false;
2574 }
2575
2576
2577 /* Analyze an SLP instance starting from a group of grouped stores. Call
2578 vect_build_slp_tree to build a tree of packed stmts if possible.
2579 Return FALSE if it's impossible to SLP any stmt in the loop. */
2580
2581 static bool
2582 vect_analyze_slp_instance (vec_info *vinfo,
2583 scalar_stmts_to_slp_tree_map_t *bst_map,
2584 stmt_vec_info stmt_info,
2585 slp_instance_kind kind,
2586 unsigned max_tree_size, unsigned *limit)
2587 {
2588 unsigned int i;
2589 vec<stmt_vec_info> scalar_stmts;
2590
2591 if (is_a <bb_vec_info> (vinfo))
2592 vect_location = stmt_info->stmt;
2593
2594 stmt_vec_info next_info = stmt_info;
2595 if (kind == slp_inst_kind_store)
2596 {
2597 /* Collect the stores and store them in scalar_stmts. */
2598 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2599 while (next_info)
2600 {
2601 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2602 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2603 }
2604 }
2605 else if (kind == slp_inst_kind_reduc_chain)
2606 {
2607 /* Collect the reduction stmts and store them in scalar_stmts. */
2608 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2609 while (next_info)
2610 {
2611 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2612 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2613 }
2614 /* Mark the first element of the reduction chain as reduction to properly
2615 transform the node. In the reduction analysis phase only the last
2616 element of the chain is marked as reduction. */
2617 STMT_VINFO_DEF_TYPE (stmt_info)
2618 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2619 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2620 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2621 }
2622 else if (kind == slp_inst_kind_ctor)
2623 {
2624 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2625 tree val;
2626 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2627 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2628 {
2629 stmt_vec_info def_info = vinfo->lookup_def (val);
2630 def_info = vect_stmt_to_vectorize (def_info);
2631 scalar_stmts.quick_push (def_info);
2632 }
2633 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_NOTE, vect_location,
2635 "Analyzing vectorizable constructor: %G\n",
2636 stmt_info->stmt);
2637 }
2638 else if (kind == slp_inst_kind_reduc_group)
2639 {
2640 /* Collect reduction statements. */
2641 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2642 scalar_stmts.create (reductions.length ());
2643 for (i = 0; reductions.iterate (i, &next_info); i++)
2644 if (STMT_VINFO_RELEVANT_P (next_info)
2645 || STMT_VINFO_LIVE_P (next_info))
2646 scalar_stmts.quick_push (next_info);
2647 /* If less than two were relevant/live there's nothing to SLP. */
2648 if (scalar_stmts.length () < 2)
2649 return false;
2650 }
2651 else
2652 gcc_unreachable ();
2653
2654 /* Build the tree for the SLP instance. */
2655 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2656 kind == slp_inst_kind_ctor
2657 ? stmt_info : NULL,
2658 max_tree_size, limit, bst_map,
2659 kind == slp_inst_kind_store
2660 ? stmt_info : NULL);
2661
2662 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2663 where we should do store group splitting. */
2664
2665 return res;
2666 }
2667
2668 /* Check if there are stmts in the loop that can be vectorized using SLP.
2669 Build SLP trees of packed scalar stmts if SLP is possible. */
2670
2671 opt_result
2672 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2673 {
2674 unsigned int i;
2675 stmt_vec_info first_element;
2676 slp_instance instance;
2677
2678 DUMP_VECT_SCOPE ("vect_analyze_slp");
2679
2680 unsigned limit = max_tree_size;
2681
2682 scalar_stmts_to_slp_tree_map_t *bst_map
2683 = new scalar_stmts_to_slp_tree_map_t ();
2684
2685 /* Find SLP sequences starting from groups of grouped stores. */
2686 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2687 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2688 STMT_VINFO_GROUPED_ACCESS (first_element)
2689 ? slp_inst_kind_store : slp_inst_kind_ctor,
2690 max_tree_size, &limit);
2691
2692 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2693 {
2694 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2695 {
2696 vect_location = bb_vinfo->roots[i].root->stmt;
2697 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2698 bb_vinfo->roots[i].stmts,
2699 bb_vinfo->roots[i].root,
2700 max_tree_size, &limit, bst_map, NULL))
2701 bb_vinfo->roots[i].stmts = vNULL;
2702 }
2703 }
2704
2705 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2706 {
2707 /* Find SLP sequences starting from reduction chains. */
2708 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2709 if (! STMT_VINFO_RELEVANT_P (first_element)
2710 && ! STMT_VINFO_LIVE_P (first_element))
2711 ;
2712 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2713 slp_inst_kind_reduc_chain,
2714 max_tree_size, &limit))
2715 {
2716 /* Dissolve reduction chain group. */
2717 stmt_vec_info vinfo = first_element;
2718 stmt_vec_info last = NULL;
2719 while (vinfo)
2720 {
2721 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2722 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2723 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2724 last = vinfo;
2725 vinfo = next;
2726 }
2727 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2728 /* It can still be vectorized as part of an SLP reduction. */
2729 loop_vinfo->reductions.safe_push (last);
2730 }
2731
2732 /* Find SLP sequences starting from groups of reductions. */
2733 if (loop_vinfo->reductions.length () > 1)
2734 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2735 slp_inst_kind_reduc_group, max_tree_size,
2736 &limit);
2737 }
2738
2739 hash_set<slp_tree> visited_patterns;
2740 slp_tree_to_load_perm_map_t perm_cache;
2741 /* See if any patterns can be found in the SLP tree. */
2742 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2743 vect_match_slp_patterns (instance, vinfo, &visited_patterns, &perm_cache,
2744 bst_map);
2745
2746 /* The map keeps a reference on SLP nodes built, release that. */
2747 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2748 it != bst_map->end (); ++it)
2749 if ((*it).second)
2750 vect_free_slp_tree ((*it).second);
2751 delete bst_map;
2752
2753 return opt_result::success ();
2754 }
2755
2756 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2757
2758 static void
2759 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2760 vec<slp_tree> &vertices, vec<int> &leafs)
2761 {
2762 unsigned i;
2763 slp_tree child;
2764
2765 if (visited.add (node))
2766 return;
2767
2768 node->vertex = vertices.length ();
2769 vertices.safe_push (node);
2770
2771 bool leaf = true;
2772 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2773 if (child)
2774 {
2775 leaf = false;
2776 vect_slp_build_vertices (visited, child, vertices, leafs);
2777 }
2778 if (leaf)
2779 leafs.safe_push (node->vertex);
2780 }
2781
2782 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2783
2784 static void
2785 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2786 vec<int> &leafs)
2787 {
2788 hash_set<slp_tree> visited;
2789 unsigned i;
2790 slp_instance instance;
2791 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2792 {
2793 unsigned n_v = vertices.length ();
2794 unsigned n_l = leafs.length ();
2795 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2796 leafs);
2797 /* If we added vertices but no entries to the reverse graph we've
2798 added a cycle that is not backwards-reachable. Push the entry
2799 then so it mimics a leaf. */
2800 if (vertices.length () > n_v
2801 && leafs.length () == n_l)
2802 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2803 }
2804 }
2805
2806 /* Apply (reverse) bijective PERM to VEC. */
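/* For example, PERM = { 2, 0, 1 } applied forward to { a, b, c }
   yields { c, a, b } (vec[i] = saved[perm[i]]), while the reverse
   application yields { b, c, a } (vec[perm[i]] = saved[i]).  */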
2807
2808 template <class T>
2809 static void
2810 vect_slp_permute (vec<unsigned> perm,
2811 vec<T> &vec, bool reverse)
2812 {
2813 auto_vec<T, 64> saved;
2814 saved.create (vec.length ());
2815 for (unsigned i = 0; i < vec.length (); ++i)
2816 saved.quick_push (vec[i]);
2817
2818 if (reverse)
2819 {
2820 for (unsigned i = 0; i < vec.length (); ++i)
2821 vec[perm[i]] = saved[i];
2822 for (unsigned i = 0; i < vec.length (); ++i)
2823 gcc_assert (vec[perm[i]] == saved[i]);
2824 }
2825 else
2826 {
2827 for (unsigned i = 0; i < vec.length (); ++i)
2828 vec[i] = saved[perm[i]];
2829 for (unsigned i = 0; i < vec.length (); ++i)
2830 gcc_assert (vec[i] == saved[perm[i]]);
2831 }
2832 }
2833
2834 /* Return whether permutations PERM_A and PERM_B as recorded in the
2835 PERMS vector are equal. */
2836
2837 static bool
2838 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2839 int perm_a, int perm_b)
2840 {
2841 return (perm_a == perm_b
2842 || (perms[perm_a].length () == perms[perm_b].length ()
2843 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2844 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2845 }
2846
2847 /* Optimize the SLP graph of VINFO. */
2848
2849 void
2850 vect_optimize_slp (vec_info *vinfo)
2851 {
2852 if (vinfo->slp_instances.is_empty ())
2853 return;
2854
2855 slp_tree node;
2856 unsigned i;
2857 auto_vec<slp_tree> vertices;
2858 auto_vec<int> leafs;
2859 vect_slp_build_vertices (vinfo, vertices, leafs);
2860
2861 struct graph *slpg = new_graph (vertices.length ());
2862 FOR_EACH_VEC_ELT (vertices, i, node)
2863 {
2864 unsigned j;
2865 slp_tree child;
2866 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2867 if (child)
2868 add_edge (slpg, i, child->vertex);
2869 }
2870
2871 /* Compute (reverse) postorder on the inverted graph. */
2872 auto_vec<int> ipo;
2873 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2874
2875 auto_sbitmap n_visited (vertices.length ());
2876 auto_sbitmap n_materialize (vertices.length ());
2877 auto_vec<int> n_perm (vertices.length ());
2878 auto_vec<vec<unsigned> > perms;
2879
2880 bitmap_clear (n_visited);
2881 bitmap_clear (n_materialize);
2882 n_perm.quick_grow_cleared (vertices.length ());
2883 perms.safe_push (vNULL); /* zero is no permute */
2884
2885 /* Produce initial permutations. */
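/* For example, a load node with load permutation { 5, 4, 7, 6 } has
   IMIN = 4, IMAX = 7 and is recorded with the local permute
   { 1, 0, 3, 2 }.  */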
2886 for (i = 0; i < leafs.length (); ++i)
2887 {
2888 int idx = leafs[i];
2889 slp_tree node = vertices[idx];
2890
2891 /* Handle externals and constants optimistically throughout the
2892 iteration. */
2893 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2894 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2895 continue;
2896
2897 /* Leafs do not change across iterations. Note leafs also double
2898 as entries to the reverse graph. */
2899 if (!slpg->vertices[idx].succ)
2900 bitmap_set_bit (n_visited, idx);
2901 /* Loads are the only thing generating permutes. */
2902 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2903 continue;
2904
2905 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2906 node unpermuted, record this permute. */
2907 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2908 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2909 continue;
2910 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2911 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2912 bool any_permute = false;
2913 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2914 {
2915 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2916 imin = MIN (imin, idx);
2917 imax = MAX (imax, idx);
2918 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2919 any_permute = true;
2920 }
2921 /* If there's no permute no need to split one out. */
2922 if (!any_permute)
2923 continue;
2924 /* If the span doesn't match we'd disrupt VF computation, avoid
2925 that for now. */
2926 if (imax - imin + 1 != SLP_TREE_LANES (node))
2927 continue;
2928
2929 /* For now only handle true permutes, like
2930 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2931 when permuting constants and invariants keeping the permute
2932 bijective. */
2933 auto_sbitmap load_index (SLP_TREE_LANES (node));
2934 bitmap_clear (load_index);
2935 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2936 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2937 unsigned j;
2938 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2939 if (!bitmap_bit_p (load_index, j))
2940 break;
2941 if (j != SLP_TREE_LANES (node))
2942 continue;
2943
2944 vec<unsigned> perm = vNULL;
2945 perm.safe_grow (SLP_TREE_LANES (node), true);
2946 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2947 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2948 perms.safe_push (perm);
2949 n_perm[idx] = perms.length () - 1;
2950 }
2951
2952 /* Propagate permutes along the graph and compute materialization points. */
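/* A node's permute is the common permute of all its already visited
   successors; if they disagree, or any successor is unpermuted, the
   node becomes unpermuted (permute index zero).  Nodes whose
   predecessors do not all share the node's permute are marked as
   materialization points.  */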
2953 bool changed;
2954 unsigned iteration = 0;
2955 do
2956 {
2957 changed = false;
2958 ++iteration;
2959
2960 for (i = vertices.length (); i > 0 ; --i)
2961 {
2962 int idx = ipo[i-1];
2963 slp_tree node = vertices[idx];
2964 /* For leafs there's nothing to do - we've seeded permutes
2965 on those above. */
2966 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2967 continue;
2968
2969 bitmap_set_bit (n_visited, idx);
2970
2971 /* We cannot move a permute across a store. */
2972 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2973 && DR_IS_WRITE
2974 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2975 continue;
2976
2977 int perm = -1;
2978 for (graph_edge *succ = slpg->vertices[idx].succ;
2979 succ; succ = succ->succ_next)
2980 {
2981 int succ_idx = succ->dest;
2982 /* Handle unvisited nodes optimistically. */
2983 /* ??? But for constants once we want to handle non-bijective
2984 permutes we have to verify the permute, when unifying lanes,
2985 will not unify different constants. For example see
2986 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2987 if (!bitmap_bit_p (n_visited, succ_idx))
2988 continue;
2989 int succ_perm = n_perm[succ_idx];
2990 /* Once we materialize the successor's permutation its output lanes
2991 appear unpermuted to us. */
2992 if (bitmap_bit_p (n_materialize, succ_idx))
2993 succ_perm = 0;
2994 if (perm == -1)
2995 perm = succ_perm;
2996 else if (succ_perm == 0)
2997 {
2998 perm = 0;
2999 break;
3000 }
3001 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3002 {
3003 perm = 0;
3004 break;
3005 }
3006 }
3007
3008 if (perm == -1)
3009 /* Pick up pre-computed leaf values. */
3010 perm = n_perm[idx];
3011 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3012 {
3013 if (iteration > 1)
3014 /* Make sure we eventually converge. */
3015 gcc_checking_assert (perm == 0);
3016 n_perm[idx] = perm;
3017 if (perm == 0)
3018 bitmap_clear_bit (n_materialize, idx);
3019 changed = true;
3020 }
3021
3022 if (perm == 0)
3023 continue;
3024
3025 /* Elide pruning at materialization points in the first
3026 iteration so that every node is visited at least once. */
3027 if (iteration == 1)
3028 continue;
3029
3030 /* Decide on permute materialization. Look whether there's
3031 a use (pred) edge that is permuted differently than us.
3032 In that case mark ourselves so the permutation is applied. */
3033 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3034 for (graph_edge *pred = slpg->vertices[idx].pred;
3035 pred; pred = pred->pred_next)
3036 {
3037 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3038 int pred_perm = n_perm[pred->src];
3039 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3040 {
3041 all_preds_permuted = false;
3042 break;
3043 }
3044 }
3045 if (!all_preds_permuted)
3046 {
3047 if (!bitmap_bit_p (n_materialize, idx))
3048 changed = true;
3049 bitmap_set_bit (n_materialize, idx);
3050 }
3051 }
3052 }
3053 while (changed || iteration == 1);
3054
3055 /* Materialize. */
3056 for (i = 0; i < vertices.length (); ++i)
3057 {
3058 int perm = n_perm[i];
3059 if (perm <= 0)
3060 continue;
3061
3062 slp_tree node = vertices[i];
3063
3064 /* First permute invariant/external original successors. */
3065 unsigned j;
3066 slp_tree child;
3067 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3068 {
3069 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3070 continue;
3071
3072 /* If the vector is uniform there's nothing to do. */
3073 if (vect_slp_tree_uniform_p (child))
3074 continue;
3075
3076 /* We can end up sharing some externals via two_operator
3077 handling. Be prepared to unshare those. */
3078 if (child->refcnt != 1)
3079 {
3080 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3081 SLP_TREE_CHILDREN (node)[j] = child
3082 = vect_create_new_slp_node
3083 (SLP_TREE_SCALAR_OPS (child).copy ());
3084 }
3085 vect_slp_permute (perms[perm],
3086 SLP_TREE_SCALAR_OPS (child), true);
3087 }
3088
3089 if (bitmap_bit_p (n_materialize, i))
3090 {
3091 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3092 /* For loads simply drop the permutation, the load permutation
3093 already performs the desired permutation. */
3094 ;
3095 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3096 {
3097 /* If the node is already a permute node we just need to apply
3098 the permutation to the permute node itself. */
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "simplifying permute node %p\n",
3102 node);
3103
3104 vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
3105 true);
3106 }
3107 else
3108 {
3109 if (dump_enabled_p ())
3110 dump_printf_loc (MSG_NOTE, vect_location,
3111 "inserting permute node in place of %p\n",
3112 node);
3113
3114 /* Make a copy of NODE and in-place change it to a
3115 VEC_PERM node to permute the lanes of the copy. */
3116 slp_tree copy = new _slp_tree;
3117 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3118 SLP_TREE_CHILDREN (node) = vNULL;
3119 SLP_TREE_SCALAR_STMTS (copy)
3120 = SLP_TREE_SCALAR_STMTS (node).copy ();
3121 vect_slp_permute (perms[perm],
3122 SLP_TREE_SCALAR_STMTS (copy), true);
3123 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3124 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3125 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3126 SLP_TREE_LANE_PERMUTATION (copy)
3127 = SLP_TREE_LANE_PERMUTATION (node);
3128 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3129 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3130 copy->refcnt = 1;
3131 copy->max_nunits = node->max_nunits;
3132 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3133 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3134 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3135
3136 /* Now turn NODE into a VEC_PERM. */
3137 SLP_TREE_CHILDREN (node).safe_push (copy);
3138 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3139 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3140 SLP_TREE_LANE_PERMUTATION (node)
3141 .quick_push (std::make_pair (0, perms[perm][j]));
3142 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3143 }
3144 }
3145 else
3146 {
3147 /* Apply the reverse permutation to our stmts. */
3148 vect_slp_permute (perms[perm],
3149 SLP_TREE_SCALAR_STMTS (node), true);
3150 /* And to the load permutation, which we can simply
3151 make regular by design. */
3152 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3153 {
3154 /* ??? When we handle non-bijective permutes the idea
3155 is that we can force the load-permutation to be
3156 { min, min + 1, min + 2, ... max }. But then the
3157 scalar defs might no longer match the lane content
3158 which means wrong-code with live lane vectorization.
3159 So we possibly have to have NULL entries for those. */
3160 vect_slp_permute (perms[perm],
3161 SLP_TREE_LOAD_PERMUTATION (node), true);
3162 }
3163 }
3164 }
3165
3166 /* Free the perms vector used for propagation. */
3167 while (!perms.is_empty ())
3168 perms.pop ().release ();
3169 free_graph (slpg);
3170
3171
3172 /* Now elide load permutations that are not necessary. */
3173 for (i = 0; i < leafs.length (); ++i)
3174 {
3175 node = vertices[leafs[i]];
3176 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3177 continue;
3178
3179 /* In basic block vectorization we allow any subchain of an interleaving
3180 chain.
3181 FORNOW: not in loop SLP because of realignment complications. */
3182 if (is_a <bb_vec_info> (vinfo))
3183 {
3184 bool subchain_p = true;
3185 stmt_vec_info next_load_info = NULL;
3186 stmt_vec_info load_info;
3187 unsigned j;
3188 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3189 {
3190 if (j != 0
3191 && (next_load_info != load_info
3192 || DR_GROUP_GAP (load_info) != 1))
3193 {
3194 subchain_p = false;
3195 break;
3196 }
3197 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3198 }
3199 if (subchain_p)
3200 {
3201 SLP_TREE_LOAD_PERMUTATION (node).release ();
3202 continue;
3203 }
3204 }
3205 else
3206 {
3207 stmt_vec_info load_info;
3208 bool this_load_permuted = false;
3209 unsigned j;
3210 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3211 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3212 {
3213 this_load_permuted = true;
3214 break;
3215 }
3216 stmt_vec_info first_stmt_info
3217 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3218 if (!this_load_permuted
3219 /* The load requires permutation when unrolling exposes
3220 a gap either because the group is larger than the SLP
3221 group-size or because there is a gap between the groups. */
3222 && (known_eq (LOOP_VINFO_VECT_FACTOR
3223 (as_a <loop_vec_info> (vinfo)), 1U)
3224 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3225 && DR_GROUP_GAP (first_stmt_info) == 0)))
3226 {
3227 SLP_TREE_LOAD_PERMUTATION (node).release ();
3228 continue;
3229 }
3230 }
3231 }
3232 }
3233
3234 /* Gather loads reachable from the individual SLP graph entries. */
3235
3236 void
3237 vect_gather_slp_loads (vec_info *vinfo)
3238 {
3239 unsigned i;
3240 slp_instance instance;
3241 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3242 {
3243 hash_set<slp_tree> visited;
3244 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3245 SLP_INSTANCE_TREE (instance), visited);
3246 }
3247 }
3248
3249
3250 /* For each possible SLP instance decide whether to SLP it and calculate overall
3251 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3252 least one instance. */
3253
3254 bool
3255 vect_make_slp_decision (loop_vec_info loop_vinfo)
3256 {
3257 unsigned int i;
3258 poly_uint64 unrolling_factor = 1;
3259 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3260 slp_instance instance;
3261 int decided_to_slp = 0;
3262
3263 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3264
3265 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3266 {
3267 /* FORNOW: SLP if you can. */
3268 /* All unroll factors have the form:
3269
3270 GET_MODE_SIZE (vinfo->vector_mode) * X
3271
3272 for some rational X, so they must have a common multiple. */
3273 unrolling_factor
3274 = force_common_multiple (unrolling_factor,
3275 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3276
3277 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3278 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3279 loop-based vectorization. Such stmts will be marked as HYBRID. */
3280 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3281 decided_to_slp++;
3282 }
3283
3284 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3285
3286 if (decided_to_slp && dump_enabled_p ())
3287 {
3288 dump_printf_loc (MSG_NOTE, vect_location,
3289 "Decided to SLP %d instances. Unrolling factor ",
3290 decided_to_slp);
3291 dump_dec (MSG_NOTE, unrolling_factor);
3292 dump_printf (MSG_NOTE, "\n");
3293 }
3294
3295 return (decided_to_slp > 0);
3296 }
3297
3298 /* Private data for vect_detect_hybrid_slp. */
3299 struct vdhs_data
3300 {
3301 loop_vec_info loop_vinfo;
3302 vec<stmt_vec_info> *worklist;
3303 };
3304
3305 /* Walker for walk_gimple_op. */
3306
3307 static tree
3308 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3309 {
3310 walk_stmt_info *wi = (walk_stmt_info *)data;
3311 vdhs_data *dat = (vdhs_data *)wi->info;
3312
3313 if (wi->is_lhs)
3314 return NULL_TREE;
3315
3316 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3317 if (!def_stmt_info)
3318 return NULL_TREE;
3319 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3320 if (PURE_SLP_STMT (def_stmt_info))
3321 {
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3324 def_stmt_info->stmt);
3325 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3326 dat->worklist->safe_push (def_stmt_info);
3327 }
3328
3329 return NULL_TREE;
3330 }
3331
3332 /* Check whether STMT_INFO is consumed only by SLP (possibly indirectly) and
3333 mark it pure_slp if so, otherwise push it to WORKLIST. */
3334
3335 static void
3336 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3337 vec<stmt_vec_info> &worklist,
3338 stmt_vec_info stmt_info)
3339 {
3340 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE, vect_location,
3342 "Processing hybrid candidate : %G", stmt_info->stmt);
3343 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3344 imm_use_iterator iter2;
3345 ssa_op_iter iter1;
3346 use_operand_p use_p;
3347 def_operand_p def_p;
3348 bool any_def = false;
3349 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3350 {
3351 any_def = true;
3352 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3353 {
3354 if (is_gimple_debug (USE_STMT (use_p)))
3355 continue;
3356 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3357 /* An out-of-loop use means this is a loop_vect sink. */
3358 if (!use_info)
3359 {
3360 if (dump_enabled_p ())
3361 dump_printf_loc (MSG_NOTE, vect_location,
3362 "Found loop_vect sink: %G", stmt_info->stmt);
3363 worklist.safe_push (stmt_info);
3364 return;
3365 }
3366 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3367 {
3368 if (dump_enabled_p ())
3369 dump_printf_loc (MSG_NOTE, vect_location,
3370 "Found loop_vect use: %G", use_info->stmt);
3371 worklist.safe_push (stmt_info);
3372 return;
3373 }
3374 }
3375 }
3376 /* No def means this is a loop_vect sink. */
3377 if (!any_def)
3378 {
3379 if (dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE, vect_location,
3381 "Found loop_vect sink: %G", stmt_info->stmt);
3382 worklist.safe_push (stmt_info);
3383 return;
3384 }
3385 if (dump_enabled_p ())
3386 dump_printf_loc (MSG_NOTE, vect_location,
3387 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3388 STMT_SLP_TYPE (stmt_info) = pure_slp;
3389 }
3390
3391 /* Find stmts that must be both vectorized and SLPed. */
3392
3393 void
3394 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3395 {
3396 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3397
3398 /* All stmts participating in SLP are marked pure_slp, all other
3399 stmts are loop_vect.
3400 First collect all loop_vect stmts into a worklist.
3401 SLP patterns cause not all original scalar stmts to appear in
3402 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3403 Rectify this here and do a backward walk over the IL only considering
3404 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3405 mark them as pure_slp. */
3406 auto_vec<stmt_vec_info> worklist;
3407 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3408 {
3409 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3410 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3411 gsi_next (&gsi))
3412 {
3413 gphi *phi = gsi.phi ();
3414 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3415 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3416 maybe_push_to_hybrid_worklist (loop_vinfo,
3417 worklist, stmt_info);
3418 }
3419 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3420 gsi_prev (&gsi))
3421 {
3422 gimple *stmt = gsi_stmt (gsi);
3423 if (is_gimple_debug (stmt))
3424 continue;
3425 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3426 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3427 {
3428 for (gimple_stmt_iterator gsi2
3429 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3430 !gsi_end_p (gsi2); gsi_next (&gsi2))
3431 {
3432 stmt_vec_info patt_info
3433 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3434 if (!STMT_SLP_TYPE (patt_info)
3435 && STMT_VINFO_RELEVANT (patt_info))
3436 maybe_push_to_hybrid_worklist (loop_vinfo,
3437 worklist, patt_info);
3438 }
3439 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3440 }
3441 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3442 maybe_push_to_hybrid_worklist (loop_vinfo,
3443 worklist, stmt_info);
3444 }
3445 }
3446
3447 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3448 mark any SLP vectorized stmt as hybrid.
3449 ??? We're visiting def stmts N times (once for each non-SLP and
3450 once for each hybrid-SLP use). */
3451 walk_stmt_info wi;
3452 vdhs_data dat;
3453 dat.worklist = &worklist;
3454 dat.loop_vinfo = loop_vinfo;
3455 memset (&wi, 0, sizeof (wi));
3456 wi.info = (void *)&dat;
3457 while (!worklist.is_empty ())
3458 {
3459 stmt_vec_info stmt_info = worklist.pop ();
3460 /* Since SSA operands are not set up for pattern stmts we need
3461 to use walk_gimple_op. */
3462 wi.is_lhs = 0;
3463 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3464 }
3465 }
3466
3467
3468 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3469
3470 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3471 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3472 {
3473 for (unsigned i = 0; i < bbs.length (); ++i)
3474 {
3475 if (i != 0)
3476 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3477 gsi_next (&si))
3478 {
3479 gphi *phi = si.phi ();
3480 gimple_set_uid (phi, 0);
3481 add_stmt (phi);
3482 }
3483 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3484 !gsi_end_p (gsi); gsi_next (&gsi))
3485 {
3486 gimple *stmt = gsi_stmt (gsi);
3487 gimple_set_uid (stmt, 0);
3488 if (is_gimple_debug (stmt))
3489 continue;
3490 add_stmt (stmt);
3491 }
3492 }
3493 }
3494
3495
3496 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3497 stmts in the basic block. */
3498
3499 _bb_vec_info::~_bb_vec_info ()
3500 {
3501 /* Reset region marker. */
3502 for (unsigned i = 0; i < bbs.length (); ++i)
3503 {
3504 if (i != 0)
3505 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3506 gsi_next (&si))
3507 {
3508 gphi *phi = si.phi ();
3509 gimple_set_uid (phi, -1);
3510 }
3511 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3512 !gsi_end_p (gsi); gsi_next (&gsi))
3513 {
3514 gimple *stmt = gsi_stmt (gsi);
3515 gimple_set_uid (stmt, -1);
3516 }
3517 }
3518
3519 for (unsigned i = 0; i < roots.length (); ++i)
3520 roots[i].stmts.release ();
3521 roots.release ();
3522 }
3523
3524 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3525 given that child nodes have already been processed, and that
3526 their def types currently match their SLP node's def type. */
3527
3528 static bool
3529 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3530 slp_instance node_instance,
3531 stmt_vector_for_cost *cost_vec)
3532 {
3533 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3534 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3535
3536 /* Calculate the number of vector statements to be created for the
3537 scalar stmts in this node. For SLP reductions it is equal to the
3538 number of vector statements in the children (which has already been
3539 calculated by the recursive call). Otherwise it is the number of
3540 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3541 VF divided by the number of elements in a vector. */
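/* For example (illustrative numbers only): with 4 lanes, a vectorization
factor of 4 and a vector type holding 8 elements, 4 * 4 / 8 == 2 vector
statements are created for this node. */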
3542 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3543 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3544 {
3545 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3546 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3547 {
3548 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3549 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3550 break;
3551 }
3552 }
3553 else
3554 {
3555 poly_uint64 vf;
3556 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3557 vf = loop_vinfo->vectorization_factor;
3558 else
3559 vf = 1;
3560 unsigned int group_size = SLP_TREE_LANES (node);
3561 tree vectype = SLP_TREE_VECTYPE (node);
3562 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3563 = vect_get_num_vectors (vf * group_size, vectype);
3564 }
3565
3566 /* Handle purely internal nodes. */
3567 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3568 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3569
3570 if (is_a <bb_vec_info> (vinfo)
3571 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3572 {
3573 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3575 "desired vector type conflicts with earlier one "
3576 "for %G", stmt_info->stmt);
3577 return false;
3578 }
3579
3580 bool dummy;
3581 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3582 node, node_instance, cost_vec);
3583 }
3584
3585 /* Try to build NODE from scalars, returning true on success.
3586 NODE_INSTANCE is the SLP instance that contains NODE. */
3587
3588 static bool
3589 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3590 slp_instance node_instance)
3591 {
3592 stmt_vec_info stmt_info;
3593 unsigned int i;
3594
3595 if (!is_a <bb_vec_info> (vinfo)
3596 || node == SLP_INSTANCE_TREE (node_instance)
3597 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3598 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3599 return false;
3600
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_NOTE, vect_location,
3603 "Building vector operands of %p from scalars instead\n", node);
3604
3605 /* Don't remove and free the child nodes here, since they could be
3606 referenced by other structures. The analysis and scheduling phases
3607 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3608 unsigned int group_size = SLP_TREE_LANES (node);
3609 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3610 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3611 SLP_TREE_LOAD_PERMUTATION (node).release ();
3612 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3613 {
3614 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3615 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3616 }
3617 return true;
3618 }
3619
3620 /* Compute the prologue cost for invariant or constant operands represented
3621 by NODE. */
3622
3623 static void
3624 vect_prologue_cost_for_slp (slp_tree node,
3625 stmt_vector_for_cost *cost_vec)
3626 {
3627 /* There's a special case of an existing vector, that costs nothing. */
3628 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3629 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3630 return;
3631 /* Without looking at the actual initializer, a vector of
3632 constants can be implemented as a load from the constant pool.
3633 When all elements are the same we can use a splat. */
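/* E.g. (illustrative): an external operand vector {x_1, x_1, x_1, x_1} is
costed as a single scalar_to_vec splat, a mixed one such as
{x_1, y_2, x_1, y_2} as a vec_construct, while a vector of constants is
costed as a vector_load from the constant pool. */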
3634 tree vectype = SLP_TREE_VECTYPE (node);
3635 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3636 unsigned num_vects_to_check;
3637 unsigned HOST_WIDE_INT const_nunits;
3638 unsigned nelt_limit;
3639 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3640 && ! multiple_p (const_nunits, group_size))
3641 {
3642 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3643 nelt_limit = const_nunits;
3644 }
3645 else
3646 {
3647 /* If either the vector has variable length or the vectors
3648 are composed of repeated whole groups we only need to
3649 cost construction once. All vectors will be the same. */
3650 num_vects_to_check = 1;
3651 nelt_limit = group_size;
3652 }
3653 tree elt = NULL_TREE;
3654 unsigned nelt = 0;
3655 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3656 {
3657 unsigned si = j % group_size;
3658 if (nelt == 0)
3659 elt = SLP_TREE_SCALAR_OPS (node)[si];
3660 /* ??? We're just tracking whether all operands of a single
3661 vector initializer are the same, ideally we'd check if
3662 we emitted the same one already. */
3663 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3664 elt = NULL_TREE;
3665 nelt++;
3666 if (nelt == nelt_limit)
3667 {
3668 record_stmt_cost (cost_vec, 1,
3669 SLP_TREE_DEF_TYPE (node) == vect_external_def
3670 ? (elt ? scalar_to_vec : vec_construct)
3671 : vector_load,
3672 NULL, vectype, 0, vect_prologue);
3673 nelt = 0;
3674 }
3675 }
3676 }
3677
3678 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3679 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
3680
3681 Return true if the operations are supported. */
3682
3683 static bool
3684 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3685 slp_instance node_instance,
3686 hash_set<slp_tree> &visited_set,
3687 vec<slp_tree> &visited_vec,
3688 stmt_vector_for_cost *cost_vec)
3689 {
3690 int i, j;
3691 slp_tree child;
3692
3693 /* Assume we can code-generate all invariants. */
3694 if (!node
3695 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3696 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3697 return true;
3698
3699 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3700 {
3701 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_NOTE, vect_location,
3703 "Failed cyclic SLP reference in %p", node);
3704 return false;
3705 }
3706 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3707
3708 /* If we already analyzed the exact same set of scalar stmts we're done.
3709 We share the generated vector stmts for those. */
3710 if (visited_set.add (node))
3711 return true;
3712 visited_vec.safe_push (node);
3713
3714 bool res = true;
3715 unsigned visited_rec_start = visited_vec.length ();
3716 unsigned cost_vec_rec_start = cost_vec->length ();
3717 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3718 {
3719 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3720 visited_set, visited_vec,
3721 cost_vec);
3722 if (!res)
3723 break;
3724 }
3725
3726 if (res)
3727 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3728 cost_vec);
3729 /* If analysis failed we have to pop all recursive visited nodes
3730 plus ourselves. */
3731 if (!res)
3732 {
3733 while (visited_vec.length () >= visited_rec_start)
3734 visited_set.remove (visited_vec.pop ());
3735 cost_vec->truncate (cost_vec_rec_start);
3736 }
3737
3738 /* When the node can be vectorized, cost invariant nodes it references.
3739 This is not done in DFS order to allow the referring node's
3740 vectorizable_* calls to nail down the invariant node's vector type
3741 and possibly unshare it if it needs a different vector type than
3742 other referrers. */
3743 if (res)
3744 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3745 if (child
3746 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3747 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3748 /* Perform usual caching, note code-generation still
3749 code-gens these nodes multiple times but we expect
3750 to CSE them later. */
3751 && !visited_set.add (child))
3752 {
3753 visited_vec.safe_push (child);
3754 /* ??? After auditing more code paths make a "default"
3755 and push the vector type from NODE to all children
3756 if it is not already set. */
3757 /* Compute the number of vectors to be generated. */
3758 tree vector_type = SLP_TREE_VECTYPE (child);
3759 if (!vector_type)
3760 {
3761 /* For shifts with a scalar argument we don't need
3762 to cost or code-generate anything.
3763 ??? Represent this more explicitly. */
3764 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3765 == shift_vec_info_type)
3766 && j == 1);
3767 continue;
3768 }
3769 unsigned group_size = SLP_TREE_LANES (child);
3770 poly_uint64 vf = 1;
3771 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3772 vf = loop_vinfo->vectorization_factor;
3773 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3774 = vect_get_num_vectors (vf * group_size, vector_type);
3775 /* And cost them. */
3776 vect_prologue_cost_for_slp (child, cost_vec);
3777 }
3778
3779 /* If this node or any of its children can't be vectorized, try pruning
3780 the tree here rather than felling the whole thing. */
3781 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3782 {
3783 /* We'll need to revisit this for invariant costing and number
3784 of vectorized stmt setting. */
3785 res = true;
3786 }
3787
3788 return res;
3789 }
3790
3791
3792 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3793 region and that can be vectorized using vectorizable_live_operation
3794 with STMT_VINFO_LIVE_P. Live operations that are not handled will
3795 cause the scalar code computing them to be retained. */
3796
3797 static void
3798 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3799 slp_instance instance,
3800 stmt_vector_for_cost *cost_vec,
3801 hash_set<stmt_vec_info> &svisited,
3802 hash_set<slp_tree> &visited)
3803 {
3804 if (visited.add (node))
3805 return;
3806
3807 unsigned i;
3808 stmt_vec_info stmt_info;
3809 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3810 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3811 {
3812 if (svisited.contains (stmt_info))
3813 continue;
3814 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3815 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3816 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3817 /* Only the pattern root stmt computes the original scalar value. */
3818 continue;
3819 bool mark_visited = true;
3820 gimple *orig_stmt = orig_stmt_info->stmt;
3821 ssa_op_iter op_iter;
3822 def_operand_p def_p;
3823 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3824 {
3825 imm_use_iterator use_iter;
3826 gimple *use_stmt;
3827 stmt_vec_info use_stmt_info;
3828 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3829 if (!is_gimple_debug (use_stmt))
3830 {
3831 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3832 if (!use_stmt_info
3833 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3834 {
3835 STMT_VINFO_LIVE_P (stmt_info) = true;
3836 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3837 NULL, node, instance, i,
3838 false, cost_vec))
3839 /* ??? So we know we can vectorize the live stmt
3840 from one SLP node. If we cannot do so from all
3841 or none consistently we'd have to record which
3842 SLP node (and lane) we want to use for the live
3843 operation. So make sure we can code-generate
3844 from all nodes. */
3845 mark_visited = false;
3846 else
3847 STMT_VINFO_LIVE_P (stmt_info) = false;
3848 BREAK_FROM_IMM_USE_STMT (use_iter);
3849 }
3850 }
3851 /* We have to verify whether we can insert the lane extract
3852 before all uses. The following is a conservative approximation.
3853 We cannot put this into vectorizable_live_operation because
3854 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3855 doesn't work.
3856 Note that while emitting code for loads at the first load should
3857 make this a non-problem, leaves we construct from scalars are
3858 vectorized after the last scalar def.
3859 ??? If we'd actually compute the insert location during
3860 analysis we could use sth less conservative than the last
3861 scalar stmt in the node for the dominance check. */
3862 /* ??? What remains is "live" uses in vector CTORs in the same
3863 SLP graph which is where those uses can end up code-generated
3864 right after their definition instead of close to their original
3865 use. But that would restrict us to code-generate lane-extracts
3866 from the latest stmt in a node. So we compensate for this
3867 during code-generation, simply not replacing uses for those
3868 hopefully rare cases. */
3869 if (STMT_VINFO_LIVE_P (stmt_info))
3870 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3871 if (!is_gimple_debug (use_stmt)
3872 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3873 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3874 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3875 {
3876 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3878 "Cannot determine insertion place for "
3879 "lane extract\n");
3880 STMT_VINFO_LIVE_P (stmt_info) = false;
3881 mark_visited = true;
3882 }
3883 }
3884 if (mark_visited)
3885 svisited.add (stmt_info);
3886 }
3887
3888 slp_tree child;
3889 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3890 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3891 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3892 cost_vec, svisited, visited);
3893 }
3894
3895 /* Analyze statements in SLP instances of VINFO. Return true if the
3896 operations are supported. */
3897
3898 bool
3899 vect_slp_analyze_operations (vec_info *vinfo)
3900 {
3901 slp_instance instance;
3902 int i;
3903
3904 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3905
3906 hash_set<slp_tree> visited;
3907 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3908 {
3909 auto_vec<slp_tree> visited_vec;
3910 stmt_vector_for_cost cost_vec;
3911 cost_vec.create (2);
3912 if (is_a <bb_vec_info> (vinfo))
3913 vect_location = instance->location ();
3914 if (!vect_slp_analyze_node_operations (vinfo,
3915 SLP_INSTANCE_TREE (instance),
3916 instance, visited, visited_vec,
3917 &cost_vec)
3918 /* Instances with a root stmt require vectorized defs for the
3919 SLP tree root. */
3920 || (SLP_INSTANCE_ROOT_STMT (instance)
3921 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3922 != vect_internal_def)))
3923 {
3924 slp_tree node = SLP_INSTANCE_TREE (instance);
3925 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3926 if (dump_enabled_p ())
3927 dump_printf_loc (MSG_NOTE, vect_location,
3928 "removing SLP instance operations starting from: %G",
3929 stmt_info->stmt);
3930 vect_free_slp_instance (instance);
3931 vinfo->slp_instances.ordered_remove (i);
3932 cost_vec.release ();
3933 while (!visited_vec.is_empty ())
3934 visited.remove (visited_vec.pop ());
3935 }
3936 else
3937 {
3938 i++;
3939
3940 /* For BB vectorization remember the SLP graph entry
3941 cost for later. */
3942 if (is_a <bb_vec_info> (vinfo))
3943 instance->cost_vec = cost_vec;
3944 else
3945 {
3946 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3947 cost_vec.release ();
3948 }
3949 }
3950 }
3951
3952 /* Compute vectorizable live stmts. */
3953 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3954 {
3955 hash_set<stmt_vec_info> svisited;
3956 hash_set<slp_tree> visited;
3957 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3958 {
3959 vect_location = instance->location ();
3960 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3961 instance, &instance->cost_vec, svisited,
3962 visited);
3963 }
3964 }
3965
3966 return !vinfo->slp_instances.is_empty ();
3967 }
3968
3969 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
3970 transitively compressing any leader chain found on the way. */
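/* E.g. (instances named for illustration) with a recorded leader chain
I3 -> I2 -> I1 -> I1, querying I3 returns I1 and rewrites the map
entries of I3 and I2 to point directly at I1 (simple path compression). */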
3971
3972 static slp_instance
3973 get_ultimate_leader (slp_instance instance,
3974 hash_map<slp_instance, slp_instance> &instance_leader)
3975 {
3976 auto_vec<slp_instance *, 8> chain;
3977 slp_instance *tem;
3978 while (*(tem = instance_leader.get (instance)) != instance)
3979 {
3980 chain.safe_push (tem);
3981 instance = *tem;
3982 }
3983 while (!chain.is_empty ())
3984 *chain.pop () = instance;
3985 return instance;
3986 }
3987
3988 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3989
3990 static void
3991 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3992 slp_instance instance, slp_tree node,
3993 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3994 hash_map<slp_instance, slp_instance> &instance_leader,
3995 hash_set<slp_tree> &visited)
3996 {
3997 stmt_vec_info stmt_info;
3998 unsigned i;
3999
4000 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4001 {
4002 bool existed_p;
4003 slp_instance &stmt_instance
4004 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4005 if (!existed_p)
4006 ;
4007 else if (stmt_instance != instance)
4008 {
4009 /* If we're running into a previously marked stmt make us the
4010 leader of the current ultimate leader. This keeps the
4011 leader chain acyclic and works even when the current instance
4012 connects two previously independent graph parts. */
4013 slp_instance stmt_leader
4014 = get_ultimate_leader (stmt_instance, instance_leader);
4015 if (stmt_leader != instance)
4016 instance_leader.put (stmt_leader, instance);
4017 }
4018 stmt_instance = instance;
4019 }
4020
4021 if (visited.add (node))
4022 return;
4023
4024 slp_tree child;
4025 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4026 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4027 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4028 instance_leader, visited);
4029 }
4030
4031 /* Partition the SLP graph into pieces that can be costed independently. */
4032
4033 static void
4034 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4035 {
4036 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4037
4038 /* First walk the SLP graph assigning each involved scalar stmt a
4039 corresponding SLP graph entry and upon visiting a previously
4040 marked stmt, make the stmt's leader the current SLP graph entry. */
4041 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4042 hash_map<slp_instance, slp_instance> instance_leader;
4043 hash_set<slp_tree> visited;
4044 slp_instance instance;
4045 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4046 {
4047 instance_leader.put (instance, instance);
4048 vect_bb_partition_graph_r (bb_vinfo,
4049 instance, SLP_INSTANCE_TREE (instance),
4050 stmt_to_instance, instance_leader,
4051 visited);
4052 }
4053
4054 /* Then collect entries to each independent subgraph. */
4055 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4056 {
4057 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4058 leader->subgraph_entries.safe_push (instance);
4059 if (dump_enabled_p ()
4060 && leader != instance)
4061 dump_printf_loc (MSG_NOTE, vect_location,
4062 "instance %p is leader of %p\n",
4063 leader, instance);
4064 }
4065 }
4066
4067 /* Compute the scalar cost of the SLP node NODE and its children
4068 and record it in COST_VEC. Do not account defs that are marked in
4069 LIFE and update LIFE according to uses of NODE. */
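/* E.g. (illustrative): if lane 1 of NODE has a scalar use outside the SLP
graph, LIFE[1] is set below and neither that stmt nor the defs it
requires from the children are counted as scalar cost saved by
vectorization. */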
4070
4071 static void
4072 vect_bb_slp_scalar_cost (vec_info *vinfo,
4073 slp_tree node, vec<bool, va_heap> *life,
4074 stmt_vector_for_cost *cost_vec,
4075 hash_set<slp_tree> &visited)
4076 {
4077 unsigned i;
4078 stmt_vec_info stmt_info;
4079 slp_tree child;
4080
4081 if (visited.add (node))
4082 return;
4083
4084 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4085 {
4086 ssa_op_iter op_iter;
4087 def_operand_p def_p;
4088
4089 if ((*life)[i])
4090 continue;
4091
4092 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4093 gimple *orig_stmt = orig_stmt_info->stmt;
4094
4095 /* If there is a non-vectorized use of the defs then the scalar
4096 stmt is kept live, in which case we do not account it or any
4097 required defs in the SLP children in the scalar cost. This
4098 way we make the vectorization more costly when compared to
4099 the scalar cost. */
4100 if (!STMT_VINFO_LIVE_P (stmt_info))
4101 {
4102 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4103 {
4104 imm_use_iterator use_iter;
4105 gimple *use_stmt;
4106 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4107 if (!is_gimple_debug (use_stmt))
4108 {
4109 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4110 if (!use_stmt_info
4111 || !PURE_SLP_STMT
4112 (vect_stmt_to_vectorize (use_stmt_info)))
4113 {
4114 (*life)[i] = true;
4115 BREAK_FROM_IMM_USE_STMT (use_iter);
4116 }
4117 }
4118 }
4119 if ((*life)[i])
4120 continue;
4121 }
4122
4123 /* Count scalar stmts only once. */
4124 if (gimple_visited_p (orig_stmt))
4125 continue;
4126 gimple_set_visited (orig_stmt, true);
4127
4128 vect_cost_for_stmt kind;
4129 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4130 {
4131 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4132 kind = scalar_load;
4133 else
4134 kind = scalar_store;
4135 }
4136 else if (vect_nop_conversion_p (orig_stmt_info))
4137 continue;
4138 else
4139 kind = scalar_stmt;
4140 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4141 SLP_TREE_VECTYPE (node), 0, vect_body);
4142 }
4143
4144 auto_vec<bool, 20> subtree_life;
4145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4146 {
4147 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4148 {
4149 /* Do not directly pass LIFE to the recursive call, copy it to
4150 confine changes in the callee to the current child/subtree. */
4151 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4152 {
4153 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4154 for (unsigned j = 0;
4155 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4156 {
4157 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4158 if (perm.first == i)
4159 subtree_life[perm.second] = (*life)[j];
4160 }
4161 }
4162 else
4163 {
4164 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4165 subtree_life.safe_splice (*life);
4166 }
4167 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4168 visited);
4169 subtree_life.truncate (0);
4170 }
4171 }
4172 }
4173
4174 /* Check if vectorization of the basic block is profitable for the
4175 subgraph denoted by SLP_INSTANCES. */
4176
4177 static bool
4178 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4179 vec<slp_instance> slp_instances)
4180 {
4181 slp_instance instance;
4182 int i;
4183 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4184 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4185
4186 void *vect_target_cost_data = init_cost (NULL);
4187
4188 /* Calculate scalar cost and sum the cost for the vector stmts
4189 previously collected. */
4190 stmt_vector_for_cost scalar_costs;
4191 scalar_costs.create (0);
4192 hash_set<slp_tree> visited;
4193 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4194 {
4195 auto_vec<bool, 20> life;
4196 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4197 true);
4198 vect_bb_slp_scalar_cost (bb_vinfo,
4199 SLP_INSTANCE_TREE (instance),
4200 &life, &scalar_costs, visited);
4201 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4202 instance->cost_vec.release ();
4203 }
4204 /* Unset visited flag. */
4205 stmt_info_for_cost *si;
4206 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4207 gimple_set_visited (si->stmt_info->stmt, false);
4208
4209 void *scalar_target_cost_data = init_cost (NULL);
4210 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4211 scalar_costs.release ();
4212 unsigned dummy;
4213 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4214 destroy_cost_data (scalar_target_cost_data);
4215
4216 /* Complete the target-specific vector cost calculation. */
4217 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4218 &vec_inside_cost, &vec_epilogue_cost);
4219 destroy_cost_data (vect_target_cost_data);
4220
4221 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4222
4223 if (dump_enabled_p ())
4224 {
4225 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4226 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4227 vec_inside_cost);
4228 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4229 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4230 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4231 }
4232
4233 /* Vectorization is profitable if its cost is no more than the cost of the
4234 scalar version. Note that we err on the vector side for equal cost because
4235 the cost estimate is otherwise quite pessimistic (constant uses are
4236 free on the scalar side but cost a load on the vector side for
4237 example). */
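/* E.g. (illustrative numbers): with a vector inside cost of 6, a prologue
cost of 2, no epilogue cost and a scalar cost of 8, 6 + 2 > 8 is false
and the subgraph is considered profitable, implementing the tie-break
towards the vector side mentioned above. */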
4238 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4239 return false;
4240
4241 return true;
4242 }
4243
4244 /* qsort comparator for lane defs. */
4245
4246 static int
4247 vld_cmp (const void *a_, const void *b_)
4248 {
4249 auto *a = (const std::pair<unsigned, tree> *)a_;
4250 auto *b = (const std::pair<unsigned, tree> *)b_;
4251 return a->first - b->first;
4252 }
4253
4254 /* Return true if USE_STMT is a vector lane insert into VEC and set
4255 *THIS_LANE to the lane number that is set. */
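/* Illustratively, the GIMPLE form matched is
vec_2 = BIT_INSERT_EXPR <vec_1, scalar_3, bitpos>;
where bitpos must be a constant multiple of the element size;
*THIS_LANE is set to bitpos divided by the element size. */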
4256
4257 static bool
4258 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4259 {
4260 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4261 if (!use_ass
4262 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4263 || (vec
4264 ? gimple_assign_rhs1 (use_ass) != vec
4265 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4266 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4267 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4268 || !constant_multiple_p
4269 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4270 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4271 this_lane))
4272 return false;
4273 return true;
4274 }
4275
4276 /* Find any vectorizable constructors and add them to the grouped_store
4277 array. */
4278
4279 static void
4280 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4281 {
4282 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4283 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4284 !gsi_end_p (gsi); gsi_next (&gsi))
4285 {
4286 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4287 if (!assign)
4288 continue;
4289
4290 tree rhs = gimple_assign_rhs1 (assign);
4291 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4292 {
4293 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4294 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4295 CONSTRUCTOR_NELTS (rhs))
4296 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4297 || uniform_vector_p (rhs))
4298 continue;
4299
4300 unsigned j;
4301 tree val;
4302 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4303 if (TREE_CODE (val) != SSA_NAME
4304 || !bb_vinfo->lookup_def (val))
4305 break;
4306 if (j != CONSTRUCTOR_NELTS (rhs))
4307 continue;
4308
4309 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4310 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4311 }
4312 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4313 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4314 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4315 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4316 && integer_zerop (gimple_assign_rhs3 (assign))
4317 && useless_type_conversion_p
4318 (TREE_TYPE (TREE_TYPE (rhs)),
4319 TREE_TYPE (gimple_assign_rhs2 (assign)))
4320 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4321 {
4322 /* We start matching on an insert to lane zero but since the
4323 inserts need not be ordered we have to search both
4324 the def and the use chains. */
4325 tree vectype = TREE_TYPE (rhs);
4326 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4327 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4328 auto_sbitmap lanes (nlanes);
4329 bitmap_clear (lanes);
4330 bitmap_set_bit (lanes, 0);
4331 tree def = gimple_assign_lhs (assign);
4332 lane_defs.quick_push
4333 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4334 unsigned lanes_found = 1;
4335 /* Start with the use chains; the last stmt will be the root. */
4336 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4337 do
4338 {
4339 use_operand_p use_p;
4340 gimple *use_stmt;
4341 if (!single_imm_use (def, &use_p, &use_stmt))
4342 break;
4343 unsigned this_lane;
4344 if (!bb_vinfo->lookup_stmt (use_stmt)
4345 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4346 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4347 break;
4348 if (bitmap_bit_p (lanes, this_lane))
4349 break;
4350 lanes_found++;
4351 bitmap_set_bit (lanes, this_lane);
4352 gassign *use_ass = as_a <gassign *> (use_stmt);
4353 lane_defs.quick_push (std::make_pair
4354 (this_lane, gimple_assign_rhs2 (use_ass)));
4355 last = bb_vinfo->lookup_stmt (use_ass);
4356 def = gimple_assign_lhs (use_ass);
4357 }
4358 while (lanes_found < nlanes);
4359 if (lanes_found < nlanes)
4360 {
4361 /* Now search the def chain. */
4362 def = gimple_assign_rhs1 (assign);
4363 do
4364 {
4365 if (TREE_CODE (def) != SSA_NAME
4366 || !has_single_use (def))
4367 break;
4368 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4369 unsigned this_lane;
4370 if (!bb_vinfo->lookup_stmt (def_stmt)
4371 || !vect_slp_is_lane_insert (def_stmt,
4372 NULL_TREE, &this_lane)
4373 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4374 break;
4375 if (bitmap_bit_p (lanes, this_lane))
4376 break;
4377 lanes_found++;
4378 bitmap_set_bit (lanes, this_lane);
4379 lane_defs.quick_push (std::make_pair
4380 (this_lane,
4381 gimple_assign_rhs2 (def_stmt)));
4382 def = gimple_assign_rhs1 (def_stmt);
4383 }
4384 while (lanes_found < nlanes);
4385 }
4386 if (lanes_found == nlanes)
4387 {
4388 /* Sort lane_defs by the lane index and register the root. */
4389 lane_defs.qsort (vld_cmp);
4390 vec<stmt_vec_info> stmts;
4391 stmts.create (nlanes);
4392 for (unsigned i = 0; i < nlanes; ++i)
4393 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4394 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4395 stmts, last));
4396 }
4397 }
4398 }
4399 }
4400
4401 /* Walk the grouped store chains and replace entries with their
4402 pattern variant if any. */
4403
4404 static void
4405 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4406 {
4407 stmt_vec_info first_element;
4408 unsigned i;
4409
4410 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4411 {
4412 /* We also have CTORs in this array. */
4413 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4414 continue;
4415 if (STMT_VINFO_IN_PATTERN_P (first_element))
4416 {
4417 stmt_vec_info orig = first_element;
4418 first_element = STMT_VINFO_RELATED_STMT (first_element);
4419 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4420 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4421 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4422 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4423 vinfo->grouped_stores[i] = first_element;
4424 }
4425 stmt_vec_info prev = first_element;
4426 while (DR_GROUP_NEXT_ELEMENT (prev))
4427 {
4428 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4429 if (STMT_VINFO_IN_PATTERN_P (elt))
4430 {
4431 stmt_vec_info orig = elt;
4432 elt = STMT_VINFO_RELATED_STMT (elt);
4433 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4434 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4435 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4436 }
4437 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4438 prev = elt;
4439 }
4440 }
4441 }
4442
4443 /* Check if the region described by BB_VINFO can be vectorized, returning
4444 true if so. When returning false, set FATAL to true if the same failure
4445 would prevent vectorization at other vector sizes, false if it is still
4446 worth trying other sizes. N_STMTS is the number of statements in the
4447 region. */
4448
4449 static bool
4450 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4451 vec<int> *dataref_groups)
4452 {
4453 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4454
4455 slp_instance instance;
4456 int i;
4457 poly_uint64 min_vf = 2;
4458
4459 /* The first group of checks is independent of the vector size. */
4460 fatal = true;
4461
4462 /* Analyze the data references. */
4463
4464 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4465 {
4466 if (dump_enabled_p ())
4467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4468 "not vectorized: unhandled data-ref in basic "
4469 "block.\n");
4470 return false;
4471 }
4472
4473 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4474 {
4475 if (dump_enabled_p ())
4476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4477 "not vectorized: unhandled data access in "
4478 "basic block.\n");
4479 return false;
4480 }
4481
4482 vect_slp_check_for_constructors (bb_vinfo);
4483
4484 /* If there are no grouped stores and no constructors in the region
4485 there is no need to continue with pattern recog as vect_analyze_slp
4486 will fail anyway. */
4487 if (bb_vinfo->grouped_stores.is_empty ()
4488 && bb_vinfo->roots.is_empty ())
4489 {
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4492 "not vectorized: no grouped stores in "
4493 "basic block.\n");
4494 return false;
4495 }
4496
4497 /* The rest of the analysis below depends on the vector size, so from here on failures are not fatal. */
4498 fatal = false;
4499
4500 vect_pattern_recog (bb_vinfo);
4501
4502 /* Update store groups from pattern processing. */
4503 vect_fixup_store_groups_with_patterns (bb_vinfo);
4504
4505 /* Check the SLP opportunities in the basic block, analyze and build SLP
4506 trees. */
4507 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4508 {
4509 if (dump_enabled_p ())
4510 {
4511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4512 "Failed to SLP the basic block.\n");
4513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4514 "not vectorized: failed to find SLP opportunities "
4515 "in basic block.\n");
4516 }
4517 return false;
4518 }
4519
4520 /* Optimize permutations. */
4521 vect_optimize_slp (bb_vinfo);
4522
4523 /* Gather the loads reachable from the SLP graph entries. */
4524 vect_gather_slp_loads (bb_vinfo);
4525
4526 vect_record_base_alignments (bb_vinfo);
4527
4528 /* Analyze and verify the alignment of data references and the
4529 dependence in the SLP instances. */
4530 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4531 {
4532 vect_location = instance->location ();
4533 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4534 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4535 {
4536 slp_tree node = SLP_INSTANCE_TREE (instance);
4537 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4538 if (dump_enabled_p ())
4539 dump_printf_loc (MSG_NOTE, vect_location,
4540 "removing SLP instance operations starting from: %G",
4541 stmt_info->stmt);
4542 vect_free_slp_instance (instance);
4543 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4544 continue;
4545 }
4546
4547 /* Mark all the statements that we want to vectorize as pure SLP and
4548 relevant. */
4549 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4550 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4551 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4552 {
4553 STMT_SLP_TYPE (root) = pure_slp;
4554 if (is_gimple_assign (root->stmt)
4555 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4556 {
4557 /* ??? We should probably record the whole vector of
4558 root stmts so we do not have to back-track here... */
4559 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4560 n != 1; --n)
4561 {
4562 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4563 STMT_SLP_TYPE (root) = pure_slp;
4564 }
4565 }
4566 }
4567
4568 i++;
4569 }
4570 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4571 return false;
4572
4573 if (!vect_slp_analyze_operations (bb_vinfo))
4574 {
4575 if (dump_enabled_p ())
4576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4577 "not vectorized: bad operation in basic block.\n");
4578 return false;
4579 }
4580
4581 vect_bb_partition_graph (bb_vinfo);
4582
4583 return true;
4584 }
4585
4586 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4587 basic blocks in BBS, returning true on success.
4588 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4589
4590 static bool
4591 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4592 vec<int> *dataref_groups, unsigned int n_stmts)
4593 {
4594 bb_vec_info bb_vinfo;
4595 auto_vector_modes vector_modes;
4596
4597 /* Autodetect first vector size we try. */
4598 machine_mode next_vector_mode = VOIDmode;
4599 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4600 unsigned int mode_i = 0;
4601
4602 vec_info_shared shared;
4603
4604 machine_mode autodetected_vector_mode = VOIDmode;
4605 while (1)
4606 {
4607 bool vectorized = false;
4608 bool fatal = false;
4609 bb_vinfo = new _bb_vec_info (bbs, &shared);
4610
4611 bool first_time_p = shared.datarefs.is_empty ();
4612 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4613 if (first_time_p)
4614 bb_vinfo->shared->save_datarefs ();
4615 else
4616 bb_vinfo->shared->check_datarefs ();
4617 bb_vinfo->vector_mode = next_vector_mode;
4618
4619 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4620 && dbg_cnt (vect_slp))
4621 {
4622 if (dump_enabled_p ())
4623 {
4624 dump_printf_loc (MSG_NOTE, vect_location,
4625 "***** Analysis succeeded with vector mode"
4626 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4627 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4628 }
4629
4630 bb_vinfo->shared->check_datarefs ();
4631
4632 unsigned i;
4633 slp_instance instance;
4634 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4635 {
4636 if (instance->subgraph_entries.is_empty ())
4637 continue;
4638
4639 vect_location = instance->location ();
4640 if (!unlimited_cost_model (NULL)
4641 && !vect_bb_vectorization_profitable_p
4642 (bb_vinfo, instance->subgraph_entries))
4643 {
4644 if (dump_enabled_p ())
4645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4646 "not vectorized: vectorization is not "
4647 "profitable.\n");
4648 continue;
4649 }
4650
4651 if (!vectorized && dump_enabled_p ())
4652 dump_printf_loc (MSG_NOTE, vect_location,
4653 "Basic block will be vectorized "
4654 "using SLP\n");
4655 vectorized = true;
4656
4657 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4658
4659 unsigned HOST_WIDE_INT bytes;
4660 if (dump_enabled_p ())
4661 {
4662 if (GET_MODE_SIZE
4663 (bb_vinfo->vector_mode).is_constant (&bytes))
4664 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4665 "basic block part vectorized using %wu "
4666 "byte vectors\n", bytes);
4667 else
4668 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4669 "basic block part vectorized using "
4670 "variable length vectors\n");
4671 }
4672 }
4673 }
4674 else
4675 {
4676 if (dump_enabled_p ())
4677 dump_printf_loc (MSG_NOTE, vect_location,
4678 "***** Analysis failed with vector mode %s\n",
4679 GET_MODE_NAME (bb_vinfo->vector_mode));
4680 }
4681
4682 if (mode_i == 0)
4683 autodetected_vector_mode = bb_vinfo->vector_mode;
4684
4685 if (!fatal)
4686 while (mode_i < vector_modes.length ()
4687 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4688 {
4689 if (dump_enabled_p ())
4690 dump_printf_loc (MSG_NOTE, vect_location,
4691 "***** The result for vector mode %s would"
4692 " be the same\n",
4693 GET_MODE_NAME (vector_modes[mode_i]));
4694 mode_i += 1;
4695 }
4696
4697 delete bb_vinfo;
4698
4699 if (mode_i < vector_modes.length ()
4700 && VECTOR_MODE_P (autodetected_vector_mode)
4701 && (related_vector_mode (vector_modes[mode_i],
4702 GET_MODE_INNER (autodetected_vector_mode))
4703 == autodetected_vector_mode)
4704 && (related_vector_mode (autodetected_vector_mode,
4705 GET_MODE_INNER (vector_modes[mode_i]))
4706 == vector_modes[mode_i]))
4707 {
4708 if (dump_enabled_p ())
4709 dump_printf_loc (MSG_NOTE, vect_location,
4710 "***** Skipping vector mode %s, which would"
4711 " repeat the analysis for %s\n",
4712 GET_MODE_NAME (vector_modes[mode_i]),
4713 GET_MODE_NAME (autodetected_vector_mode));
4714 mode_i += 1;
4715 }
4716
4717 if (vectorized
4718 || mode_i == vector_modes.length ()
4719 || autodetected_vector_mode == VOIDmode
4720 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4721 vector sizes will fail do not bother iterating. */
4722 || fatal)
4723 return vectorized;
4724
4725 /* Try the next biggest vector size. */
4726 next_vector_mode = vector_modes[mode_i++];
4727 if (dump_enabled_p ())
4728 dump_printf_loc (MSG_NOTE, vect_location,
4729 "***** Re-trying analysis with vector mode %s\n",
4730 GET_MODE_NAME (next_vector_mode));
4731 }
4732 }
4733
4734
4735 /* Worker for the BB vectorizer entry points. Analyze and transform the
4736 region formed by BBS, returning true if anything in it was vectorized. */
4737
4738 static bool
4739 vect_slp_bbs (vec<basic_block> bbs)
4740 {
4741 vec<data_reference_p> datarefs = vNULL;
4742 auto_vec<int> dataref_groups;
4743 int insns = 0;
4744 int current_group = 0;
4745
4746 for (unsigned i = 0; i < bbs.length (); i++)
4747 {
4748 basic_block bb = bbs[i];
4749 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4750 gsi_next (&gsi))
4751 {
4752 gimple *stmt = gsi_stmt (gsi);
4753 if (is_gimple_debug (stmt))
4754 continue;
4755
4756 insns++;
4757
4758 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4759 vect_location = stmt;
4760
4761 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4762 &dataref_groups, current_group))
4763 ++current_group;
4764 }
4765 }
4766
4767 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4768 }
4769
4770 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4771 true if anything in the basic-block was vectorized. */
4772
4773 bool
4774 vect_slp_bb (basic_block bb)
4775 {
4776 auto_vec<basic_block> bbs;
4777 bbs.safe_push (bb);
4778 return vect_slp_bbs (bbs);
4779 }
4780
4781 /* Main entry for the BB vectorizer applied to a function. Analyze and
4782 transform FUN, returning true if anything in it was vectorized. */
4783
4784 bool
4785 vect_slp_function (function *fun)
4786 {
4787 bool r = false;
4788 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4789 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4790
4791 /* For the moment split the function into pieces to avoid making
4792 the iteration on the vector mode moot. Split at points we know
4793 we do not handle well, which are CFG merges (SLP discovery doesn't
4794 handle non-loop-header PHIs) and loop exits. Since pattern
4795 recog requires reverse iteration to visit uses before defs,
4796 simply chop the RPO into pieces. */
4797 auto_vec<basic_block> bbs;
4798 for (unsigned i = 0; i < n; i++)
4799 {
4800 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4801 bool split = false;
4802
4803 /* Split when a BB is not dominated by the first block. */
4804 if (!bbs.is_empty ()
4805 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4806 {
4807 if (dump_enabled_p ())
4808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4809 "splitting region at dominance boundary bb%d\n",
4810 bb->index);
4811 split = true;
4812 }
4813 /* Split when the loop determined by the first block
4814 is exited. This is because we eventually insert
4815 invariants at region begin. */
4816 else if (!bbs.is_empty ()
4817 && bbs[0]->loop_father != bb->loop_father
4818 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4819 {
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4822 "splitting region at loop %d exit at bb%d\n",
4823 bbs[0]->loop_father->num, bb->index);
4824 split = true;
4825 }
4826
4827 if (split && !bbs.is_empty ())
4828 {
4829 r |= vect_slp_bbs (bbs);
4830 bbs.truncate (0);
4831 bbs.quick_push (bb);
4832 }
4833 else
4834 bbs.safe_push (bb);
4835
4836 /* When a stmt ends this block and defines a value, inserting a
4837 vector containing its definition after it would require inserting
4838 on edges. Avoid this for now. */
4839 if (gimple *last = last_stmt (bb))
4840 if (gimple_get_lhs (last)
4841 && is_ctrl_altering_stmt (last))
4842 {
4843 if (dump_enabled_p ())
4844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4845 "splitting region at control altering "
4846 "definition %G", last);
4847 r |= vect_slp_bbs (bbs);
4848 bbs.truncate (0);
4849 }
4850 }
4851
4852 if (!bbs.is_empty ())
4853 r |= vect_slp_bbs (bbs);
4854
4855 free (rpo);
4856
4857 return r;
4858 }
4859
4860 /* Build a variable-length vector in which the elements in ELTS are repeated
4861 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4862 RESULTS and add any new instructions to SEQ.
4863
4864 The approach we use is:
4865
4866 (1) Find a vector mode VM with integer elements of mode IM.
4867
4868 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4869 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4870 from small vectors to IM.
4871
4872 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4873
4874 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4875 correct byte contents.
4876
4877 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4878
4879 We try to find the largest IM for which this sequence works, in order
4880 to cut down on the number of interleaves. */
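/* As an illustrative sketch, assuming the target chooses DImode for IM:
four SImode elements {a, b, c, d} give NVECTORS == 2. Steps (2) and (3)
build VM duplicates of the DImode values [a,b] and [c,d]; a single
interleaving VEC_PERM_EXPR in step (4) then yields the byte pattern
a b c d a b c d ..., which step (5) view-converts to VECTOR_TYPE. */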
4881
4882 void
4883 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4884 vec<tree> elts, unsigned int nresults,
4885 vec<tree> &results)
4886 {
4887 unsigned int nelts = elts.length ();
4888 tree element_type = TREE_TYPE (vector_type);
4889
4890 /* (1) Find a vector mode VM with integer elements of mode IM. */
4891 unsigned int nvectors = 1;
4892 tree new_vector_type;
4893 tree permutes[2];
4894 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4895 &nvectors, &new_vector_type,
4896 permutes))
4897 gcc_unreachable ();
4898
4899 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4900 unsigned int partial_nelts = nelts / nvectors;
4901 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4902
4903 tree_vector_builder partial_elts;
4904 auto_vec<tree, 32> pieces (nvectors * 2);
4905 pieces.quick_grow (nvectors * 2);
4906 for (unsigned int i = 0; i < nvectors; ++i)
4907 {
4908 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4909 ELTS' has mode IM. */
4910 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4911 for (unsigned int j = 0; j < partial_nelts; ++j)
4912 partial_elts.quick_push (elts[i * partial_nelts + j]);
4913 tree t = gimple_build_vector (seq, &partial_elts);
4914 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4915 TREE_TYPE (new_vector_type), t);
4916
4917 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4918 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4919 }
4920
4921 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4922 correct byte contents.
4923
4924 We need to repeat the following operation log2(nvectors) times:
4925
4926 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4927 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4928
4929 However, if each input repeats every N elements and the VF is
4930 a multiple of N * 2, the HI result is the same as the LO. */
4931 unsigned int in_start = 0;
4932 unsigned int out_start = nvectors;
4933 unsigned int hi_start = nvectors / 2;
4934 /* A bound on the number of outputs needed to produce NRESULTS results
4935 in the final iteration. */
4936 unsigned int noutputs_bound = nvectors * nresults;
4937 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4938 {
4939 noutputs_bound /= 2;
4940 unsigned int limit = MIN (noutputs_bound, nvectors);
4941 for (unsigned int i = 0; i < limit; ++i)
4942 {
4943 if ((i & 1) != 0
4944 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4945 2 * in_repeat))
4946 {
4947 pieces[out_start + i] = pieces[out_start + i - 1];
4948 continue;
4949 }
4950
4951 tree output = make_ssa_name (new_vector_type);
4952 tree input1 = pieces[in_start + (i / 2)];
4953 tree input2 = pieces[in_start + (i / 2) + hi_start];
4954 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4955 input1, input2,
4956 permutes[i & 1]);
4957 gimple_seq_add_stmt (seq, stmt);
4958 pieces[out_start + i] = output;
4959 }
4960 std::swap (in_start, out_start);
4961 }
4962
4963 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4964 results.reserve (nresults);
4965 for (unsigned int i = 0; i < nresults; ++i)
4966 if (i < nvectors)
4967 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4968 pieces[in_start + i]));
4969 else
4970 results.quick_push (results[i - nvectors]);
4971 }
4972
4973
4974 /* For constant and loop invariant defs in OP_NODE this function creates
4975 vector defs that will be used in the vectorized stmts and stores them
4976 to SLP_TREE_VEC_DEFS of OP_NODE. */
4977
4978 static void
4979 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4980 {
4981 unsigned HOST_WIDE_INT nunits;
4982 tree vec_cst;
4983 unsigned j, number_of_places_left_in_vector;
4984 tree vector_type;
4985 tree vop;
4986 int group_size = op_node->ops.length ();
4987 unsigned int vec_num, i;
4988 unsigned number_of_copies = 1;
4989 bool constant_p;
4990 gimple_seq ctor_seq = NULL;
4991 auto_vec<tree, 16> permute_results;
4992
4993 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4994 vector_type = SLP_TREE_VECTYPE (op_node);
4995
4996 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4997 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4998 auto_vec<tree> voprnds (number_of_vectors);
4999
5000 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5001 created vectors. It is greater than 1 if unrolling is performed.
5002
5003 For example, we have two scalar operands, s1 and s2 (e.g., group of
5004 strided accesses of size two), while NUNITS is four (i.e., four scalars
5005 of this type can be packed in a vector). The output vector will contain
5006 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5007 will be 2).
5008
5009 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5010 containing the operands.
5011
5012 For example, NUNITS is four as before, and the group size is 8
5013 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5014 {s5, s6, s7, s8}. */
5015
5016 /* When using duplicate_and_interleave, we just need one element for
5017 each scalar statement. */
5018 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5019 nunits = group_size;
5020
5021 number_of_copies = nunits * number_of_vectors / group_size;
5022
5023 number_of_places_left_in_vector = nunits;
5024 constant_p = true;
5025 tree_vector_builder elts (vector_type, nunits, 1);
5026 elts.quick_grow (nunits);
5027 stmt_vec_info insert_after = NULL;
5028 for (j = 0; j < number_of_copies; j++)
5029 {
5030 tree op;
5031 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5032 {
5033 /* Create 'vect_ = {op0,op1,...,opn}'. */
5034 number_of_places_left_in_vector--;
5035 tree orig_op = op;
5036 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5037 {
5038 if (CONSTANT_CLASS_P (op))
5039 {
5040 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5041 {
5042 /* Can't use VIEW_CONVERT_EXPR for booleans because
5043 of possibly different sizes of scalar value and
5044 vector element. */
5045 if (integer_zerop (op))
5046 op = build_int_cst (TREE_TYPE (vector_type), 0);
5047 else if (integer_onep (op))
5048 op = build_all_ones_cst (TREE_TYPE (vector_type));
5049 else
5050 gcc_unreachable ();
5051 }
5052 else
5053 op = fold_unary (VIEW_CONVERT_EXPR,
5054 TREE_TYPE (vector_type), op);
5055 gcc_assert (op && CONSTANT_CLASS_P (op));
5056 }
5057 else
5058 {
5059 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5060 gimple *init_stmt;
5061 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5062 {
5063 tree true_val
5064 = build_all_ones_cst (TREE_TYPE (vector_type));
5065 tree false_val
5066 = build_zero_cst (TREE_TYPE (vector_type));
5067 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5068 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5069 op, true_val,
5070 false_val);
5071 }
5072 else
5073 {
5074 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5075 op);
5076 init_stmt
5077 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5078 op);
5079 }
5080 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5081 op = new_temp;
5082 }
5083 }
5084 elts[number_of_places_left_in_vector] = op;
5085 if (!CONSTANT_CLASS_P (op))
5086 constant_p = false;
5087 /* For BB vectorization we have to compute an insert location
5088 when a def is inside the analyzed region since we cannot
5089 simply insert at the BB start in this case. */
5090 stmt_vec_info opdef;
5091 if (TREE_CODE (orig_op) == SSA_NAME
5092 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5093 && is_a <bb_vec_info> (vinfo)
5094 && (opdef = vinfo->lookup_def (orig_op)))
5095 {
5096 if (!insert_after)
5097 insert_after = opdef;
5098 else
5099 insert_after = get_later_stmt (insert_after, opdef);
5100 }
5101
5102 if (number_of_places_left_in_vector == 0)
5103 {
5104 if (constant_p
5105 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5106 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5107 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5108 else
5109 {
5110 if (permute_results.is_empty ())
5111 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5112 elts, number_of_vectors,
5113 permute_results);
5114 vec_cst = permute_results[number_of_vectors - j - 1];
5115 }
5116 if (!gimple_seq_empty_p (ctor_seq))
5117 {
5118 if (insert_after)
5119 {
5120 gimple_stmt_iterator gsi;
5121 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5122 {
5123 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5124 gsi_insert_seq_before (&gsi, ctor_seq,
5125 GSI_CONTINUE_LINKING);
5126 }
5127 else if (!stmt_ends_bb_p (insert_after->stmt))
5128 {
5129 gsi = gsi_for_stmt (insert_after->stmt);
5130 gsi_insert_seq_after (&gsi, ctor_seq,
5131 GSI_CONTINUE_LINKING);
5132 }
5133 else
5134 {
5135 /* When we want to insert after a def whose
5136 defining stmt throws, insert on the fallthru
5137 edge instead. */
5138 edge e = find_fallthru_edge
5139 (gimple_bb (insert_after->stmt)->succs);
5140 basic_block new_bb
5141 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5142 gcc_assert (!new_bb);
5143 }
5144 }
5145 else
5146 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5147 ctor_seq = NULL;
5148 }
5149 voprnds.quick_push (vec_cst);
5150 insert_after = NULL;
5151 number_of_places_left_in_vector = nunits;
5152 constant_p = true;
5153 elts.new_vector (vector_type, nunits, 1);
5154 elts.quick_grow (nunits);
5155 }
5156 }
5157 }
5158
5159 /* Since the vectors are created in reverse order, we have to reverse
5160 them here. */
5161 vec_num = voprnds.length ();
5162 for (j = vec_num; j != 0; j--)
5163 {
5164 vop = voprnds[j - 1];
5165 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5166 }
5167
5168 /* If VF is greater than the unrolling factor needed for the SLP
5169 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5170 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5171 to replicate the vectors. */
5172 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5173 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5174 i++)
5175 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5176 }
5177
5178 /* Get the Ith vectorized definition from SLP_NODE. */
5179
5180 tree
5181 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5182 {
5183 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5184 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5185 else
5186 return SLP_TREE_VEC_DEFS (slp_node)[i];
5187 }
5188
5189 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5190
5191 void
5192 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5193 {
5194 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5195 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5196 {
5197 unsigned j;
5198 gimple *vec_def_stmt;
5199 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5200 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5201 }
5202 else
5203 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5204 }
5205
5206 /* Get N vectorized definitions for SLP_NODE. */
5207
5208 void
5209 vect_get_slp_defs (vec_info *,
5210 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5211 {
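   /* N == -1U requests the defs of all children.  */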
5212 if (n == -1U)
5213 n = SLP_TREE_CHILDREN (slp_node).length ();
5214
5215 for (unsigned i = 0; i < n; ++i)
5216 {
5217 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5218 vec<tree> vec_defs = vNULL;
5219 vect_get_slp_defs (child, &vec_defs);
5220 vec_oprnds->quick_push (vec_defs);
5221 }
5222 }
5223
5224 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5225 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5226 permute statements for the SLP node NODE. Store the number of vector
5227 permute instructions in *N_PERMS and the number of vector load
5228 instructions in *N_LOADS. */
5229
5230 bool
5231 vect_transform_slp_perm_load (vec_info *vinfo,
5232 slp_tree node, vec<tree> dr_chain,
5233 gimple_stmt_iterator *gsi, poly_uint64 vf,
5234 bool analyze_only, unsigned *n_perms,
5235 unsigned int *n_loads)
5236 {
5237 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5238 int vec_index = 0;
5239 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5240 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5241 unsigned int mask_element;
5242 machine_mode mode;
5243
5244 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5245 return false;
5246
5247 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5248
5249 mode = TYPE_MODE (vectype);
5250 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5251
5252 /* Initialize the vect stmts of NODE to properly insert the generated
5253 stmts later. */
5254 if (! analyze_only)
5255 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5256 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5257 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5258
5259 /* Generate permutation masks for every NODE. The number of masks per NODE
5260 is equal to GROUP_SIZE.
5261 E.g., we have a group of three nodes with three loads from the same
5262 location in each node, and the vector size is 4. I.e., we have an
5263 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5264 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5265 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5266 ...
5267
5268 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5269 The last mask is illegal since we assume two operands for a permute
5270 operation, and the mask element values can't be outside that range.
5271 Hence, the last mask must be converted into {2,5,5,5}.
5272 For the first two permutations we need the first and the second input
5273 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5274 we need the second and the third vectors: {b1,c1,a2,b2} and
5275 {c2,a3,b3,c3}. */
5276
5277 int vect_stmts_counter = 0;
5278 unsigned int index = 0;
5279 int first_vec_index = -1;
5280 int second_vec_index = -1;
5281 bool noop_p = true;
5282 *n_perms = 0;
5283
5284 vec_perm_builder mask;
5285 unsigned int nelts_to_build;
5286 unsigned int nvectors_per_build;
5287 unsigned int in_nlanes;
5288 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5289 && multiple_p (nunits, group_size));
5290 if (repeating_p)
5291 {
5292 /* A single vector contains a whole number of copies of the node, so:
5293 (a) all permutes can use the same mask; and
5294 (b) the permutes only need a single vector input. */
5295 mask.new_vector (nunits, group_size, 3);
5296 nelts_to_build = mask.encoded_nelts ();
5297 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5298 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
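       /* E.g. for a group of two loads with load permutation {1,0} and
          eight-lane vectors the encoded mask elements built below are
          {1,0,3,2,5,4} which expand to the pair-swapping mask
          {1,0,3,2,5,4,7,6}.  */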
5299 }
5300 else
5301 {
5302 /* We need to construct a separate mask for each vector statement. */
5303 unsigned HOST_WIDE_INT const_nunits, const_vf;
5304 if (!nunits.is_constant (&const_nunits)
5305 || !vf.is_constant (&const_vf))
5306 return false;
5307 mask.new_vector (const_nunits, const_nunits, 1);
5308 nelts_to_build = const_vf * group_size;
5309 nvectors_per_build = 1;
5310 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5311 }
5312 auto_sbitmap used_in_lanes (in_nlanes);
5313 bitmap_clear (used_in_lanes);
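   /* USED_IN_LANES records which input lanes the permutation actually
      references; it is used at the end to compute *N_LOADS, the number
      of vector loads really needed.  */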
5314
5315 unsigned int count = mask.encoded_nelts ();
5316 mask.quick_grow (count);
5317 vec_perm_indices indices;
5318
5319 for (unsigned int j = 0; j < nelts_to_build; j++)
5320 {
5321 unsigned int iter_num = j / group_size;
5322 unsigned int stmt_num = j % group_size;
5323 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5324 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5325 bitmap_set_bit (used_in_lanes, i);
5326 if (repeating_p)
5327 {
5328 first_vec_index = 0;
5329 mask_element = i;
5330 }
5331 else
5332 {
5333 /* Enforced before the loop when !repeating_p. */
5334 unsigned int const_nunits = nunits.to_constant ();
5335 vec_index = i / const_nunits;
5336 mask_element = i % const_nunits;
5337 if (vec_index == first_vec_index
5338 || first_vec_index == -1)
5339 {
5340 first_vec_index = vec_index;
5341 }
5342 else if (vec_index == second_vec_index
5343 || second_vec_index == -1)
5344 {
5345 second_vec_index = vec_index;
5346 mask_element += const_nunits;
5347 }
5348 else
5349 {
5350 if (dump_enabled_p ())
5351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5352 "permutation requires at "
5353 "least three vectors %G",
5354 stmt_info->stmt);
5355 gcc_assert (analyze_only);
5356 return false;
5357 }
5358
5359 gcc_assert (mask_element < 2 * const_nunits);
5360 }
5361
5362 if (mask_element != index)
5363 noop_p = false;
5364 mask[index++] = mask_element;
5365
5366 if (index == count && !noop_p)
5367 {
5368 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5369 if (!can_vec_perm_const_p (mode, indices))
5370 {
5371 if (dump_enabled_p ())
5372 {
5373 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5374 vect_location,
5375 "unsupported vect permute { ");
5376 for (i = 0; i < count; ++i)
5377 {
5378 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5379 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5380 }
5381 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5382 }
5383 gcc_assert (analyze_only);
5384 return false;
5385 }
5386
5387 ++*n_perms;
5388 }
5389
5390 if (index == count)
5391 {
5392 if (!analyze_only)
5393 {
5394 tree mask_vec = NULL_TREE;
5395
5396 if (! noop_p)
5397 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5398
5399 if (second_vec_index == -1)
5400 second_vec_index = first_vec_index;
5401
5402 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5403 {
5404 /* Generate the permute statement if necessary. */
5405 tree first_vec = dr_chain[first_vec_index + ri];
5406 tree second_vec = dr_chain[second_vec_index + ri];
5407 gimple *perm_stmt;
5408 if (! noop_p)
5409 {
5410 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5411 tree perm_dest
5412 = vect_create_destination_var (gimple_assign_lhs (stmt),
5413 vectype);
5414 perm_dest = make_ssa_name (perm_dest);
5415 perm_stmt
5416 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5417 first_vec, second_vec,
5418 mask_vec);
5419 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5420 gsi);
5421 }
5422 else
5423 /* If mask was NULL_TREE generate the requested
5424 identity transform. */
5425 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5426
5427 /* Store the vector statement in NODE. */
5428 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5429 }
5430 }
5431
5432 index = 0;
5433 first_vec_index = -1;
5434 second_vec_index = -1;
5435 noop_p = true;
5436 }
5437 }
5438
5439 if (n_loads)
5440 {
5441 if (repeating_p)
5442 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5443 else
5444 {
5445 /* Enforced above when !repeating_p. */
5446 unsigned int const_nunits = nunits.to_constant ();
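          /* Count the input vectors (of CONST_NUNITS lanes each) that
             contain at least one lane referenced by the permutation.  */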
5447 *n_loads = 0;
5448 bool load_seen = false;
5449 for (unsigned i = 0; i < in_nlanes; ++i)
5450 {
5451 if (i % const_nunits == 0)
5452 {
5453 if (load_seen)
5454 *n_loads += 1;
5455 load_seen = false;
5456 }
5457 if (bitmap_bit_p (used_in_lanes, i))
5458 load_seen = true;
5459 }
5460 if (load_seen)
5461 *n_loads += 1;
5462 }
5463 }
5464
5465 return true;
5466 }
5467
5468
5469 /* Vectorize the SLP permutations in NODE as specified
5470 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5471 child number and lane number.
5472 Interleaving of two two-lane two-child SLP subtrees (not supported):
5473 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5474 A blend of two four-lane two-child SLP subtrees:
5475 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5476 Highpart of a four-lane one-child SLP subtree (not supported):
5477 [ { 0, 2 }, { 0, 3 } ]
5478 Currently only a subset of these is supported by the code generation below. */
5479
5480 static bool
5481 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5482 slp_tree node, stmt_vector_for_cost *cost_vec)
5483 {
5484 tree vectype = SLP_TREE_VECTYPE (node);
5485
5486 /* ??? We currently only support the case where all vector input and
5487 output types are the same, while the SLP IL should really do a
5488 concat + select and thus accept arbitrary mismatches. */
5489 slp_tree child;
5490 unsigned i;
5491 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5492 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5493 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5494 {
5495 if (dump_enabled_p ())
5496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5497 "Unsupported lane permutation\n");
5498 return false;
5499 }
5500
5501 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5502 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5503 if (dump_enabled_p ())
5504 {
5505 dump_printf_loc (MSG_NOTE, vect_location,
5506 "vectorizing permutation");
5507 for (unsigned i = 0; i < perm.length (); ++i)
5508 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5509 dump_printf (MSG_NOTE, "\n");
5510 }
5511
5512 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5513 if (!nunits.is_constant ())
5514 return false;
5515 unsigned HOST_WIDE_INT vf = 1;
5516 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5517 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5518 return false;
5519 unsigned olanes = vf * SLP_TREE_LANES (node);
5520 gcc_assert (multiple_p (olanes, nunits));
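   /* OLANES is the total number of output scalar lanes over all VF unrolled
      copies of the node and thus has to fill a whole number of vectors.  */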
5521
5522 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5523 from the { SLP operand, scalar lane } permutation as recorded in the
5524 SLP node as an intermediate step. This part should already work
5525 with SLP children with an arbitrary number of lanes. */
5526 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5527 auto_vec<unsigned> active_lane;
5528 vperm.create (olanes);
5529 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5530 for (unsigned i = 0; i < vf; ++i)
5531 {
5532 for (unsigned pi = 0; pi < perm.length (); ++pi)
5533 {
5534 std::pair<unsigned, unsigned> p = perm[pi];
5535 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5536 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5537 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5538 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5539 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5540 }
5541 /* Advance to the next group. */
5542 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5543 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5544 }
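   /* E.g. for a child with a four-lane vector type, scalar lane 5 of
      operand 1 decomposes to { { 1, 1 }, 1 }, i.e. lane 1 of the second
      vector def of child 1.  */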
5545
5546 if (dump_enabled_p ())
5547 {
5548 dump_printf_loc (MSG_NOTE, vect_location, "as");
5549 for (unsigned i = 0; i < vperm.length (); ++i)
5550 {
5551 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5552 dump_printf (MSG_NOTE, ",");
5553 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5554 vperm[i].first.first, vperm[i].first.second,
5555 vperm[i].second);
5556 }
5557 dump_printf (MSG_NOTE, "\n");
5558 }
5559
5560 /* We can only handle two-vector permutes; everything else should
5561 be lowered on the SLP level. The following is closely inspired
5562 by vect_transform_slp_perm_load and is supposed to eventually
5563 replace it.
5564 ??? As intermediate step do code-gen in the SLP tree representation
5565 somehow? */
5566 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5567 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5568 unsigned int const_nunits = nunits.to_constant ();
5569 unsigned int index = 0;
5570 unsigned int mask_element;
5571 vec_perm_builder mask;
5572 mask.new_vector (const_nunits, const_nunits, 1);
5573 unsigned int count = mask.encoded_nelts ();
5574 mask.quick_grow (count);
5575 vec_perm_indices indices;
5576 unsigned nperms = 0;
5577 for (unsigned i = 0; i < vperm.length (); ++i)
5578 {
5579 mask_element = vperm[i].second;
5580 if (first_vec.first == -1U
5581 || first_vec == vperm[i].first)
5582 first_vec = vperm[i].first;
5583 else if (second_vec.first == -1U
5584 || second_vec == vperm[i].first)
5585 {
5586 second_vec = vperm[i].first;
5587 mask_element += const_nunits;
5588 }
5589 else
5590 {
5591 if (dump_enabled_p ())
5592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5593 "permutation requires at "
5594 "least three vectors\n");
5595 gcc_assert (!gsi);
5596 return false;
5597 }
5598
5599 mask[index++] = mask_element;
5600
5601 if (index == count)
5602 {
5603 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5604 const_nunits);
5605 bool identity_p = indices.series_p (0, 1, 0, 1);
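          /* An identity series {0, 1, 2, ...} selects the first input
             unchanged, so no VEC_PERM_EXPR is needed, only possibly a
             plain copy.  */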
5606 if (!identity_p
5607 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5608 {
5609 if (dump_enabled_p ())
5610 {
5611 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5612 vect_location,
5613 "unsupported vect permute { ");
5614 for (i = 0; i < count; ++i)
5615 {
5616 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5617 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5618 }
5619 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5620 }
5621 gcc_assert (!gsi);
5622 return false;
5623 }
5624
5625 if (!identity_p)
5626 nperms++;
5627 if (gsi)
5628 {
5629 if (second_vec.first == -1U)
5630 second_vec = first_vec;
5631
5632 /* Generate the permute statement if necessary. */
5633 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5634 tree first_def
5635 = vect_get_slp_vect_def (first_node, first_vec.second);
5636 gassign *perm_stmt;
5637 tree perm_dest = make_ssa_name (vectype);
5638 if (!identity_p)
5639 {
5640 slp_tree second_node
5641 = SLP_TREE_CHILDREN (node)[second_vec.first];
5642 tree second_def
5643 = vect_get_slp_vect_def (second_node, second_vec.second);
5644 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5645 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5646 first_def, second_def,
5647 mask_vec);
5648 }
5649 else
5650 /* We need a copy here in case the def was external. */
5651 perm_stmt = gimple_build_assign (perm_dest, first_def);
5652 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5653 /* Store the vector statement in NODE. */
5654 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5655 }
5656
5657 index = 0;
5658 first_vec = std::make_pair (-1U, -1U);
5659 second_vec = std::make_pair (-1U, -1U);
5660 }
5661 }
5662
5663 if (!gsi)
5664 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5665
5666 return true;
5667 }
5668
5669 /* Vectorize SLP NODE. */
5670
5671 static void
5672 vect_schedule_slp_node (vec_info *vinfo,
5673 slp_tree node, slp_instance instance)
5674 {
5675 gimple_stmt_iterator si;
5676 int i;
5677 slp_tree child;
5678
5679 /* For existing vectors there's nothing to do. */
5680 if (SLP_TREE_VEC_DEFS (node).exists ())
5681 return;
5682
5683 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5684
5685 /* Vectorize externals and constants. */
5686 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5687 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5688 {
5689 /* ??? vectorizable_shift can end up using a scalar operand which is
5690 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5691 node in this case. */
5692 if (!SLP_TREE_VECTYPE (node))
5693 return;
5694
5695 vect_create_constant_vectors (vinfo, node);
5696 return;
5697 }
5698
5699 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5700
5701 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5702 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5703
5704 if (dump_enabled_p ())
5705 dump_printf_loc (MSG_NOTE, vect_location,
5706 "------>vectorizing SLP node starting from: %G",
5707 stmt_info->stmt);
5708
5709 if (STMT_VINFO_DATA_REF (stmt_info)
5710 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5711 {
5712 /* Vectorized loads go before the first scalar load to make the
5713 result available early, while vectorized stores go before the last
5714 scalar stmt, which is where all uses are ready. */
5715 stmt_vec_info last_stmt_info = NULL;
5716 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5717 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5718 else /* DR_IS_WRITE */
5719 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5720 si = gsi_for_stmt (last_stmt_info->stmt);
5721 }
5722 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5723 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5724 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5725 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5726 {
5727 /* For PHI node vectorization we do not use the insertion iterator. */
5728 si = gsi_none ();
5729 }
5730 else
5731 {
5732 /* Emit other stmts after the children's vectorized defs, which is the
5733 earliest position possible. */
5734 gimple *last_stmt = NULL;
5735 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5736 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5737 {
5738 /* For fold-left reductions we are retaining the scalar
5739 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5740 set so the representation isn't perfect. Resort to the
5741 last scalar def here. */
5742 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5743 {
5744 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5745 == cycle_phi_info_type);
5746 gphi *phi = as_a <gphi *>
5747 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5748 if (!last_stmt
5749 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5750 last_stmt = phi;
5751 }
5752 /* We are emitting all vectorized stmts in the same place, so
5753 the stmt generated last is the latest one.
5754 ??? Unless we have a load permutation applied and that
5755 happens to re-use an earlier generated load. */
5756 unsigned j;
5757 gimple *vstmt;
5758 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5759 if (!last_stmt
5760 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5761 last_stmt = vstmt;
5762 }
5763 else if (!SLP_TREE_VECTYPE (child))
5764 {
5765 /* For externals the operand is used unvectorized, so consider all of its scalar defs. */
5766 unsigned j;
5767 tree def;
5768 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5769 if (TREE_CODE (def) == SSA_NAME
5770 && !SSA_NAME_IS_DEFAULT_DEF (def))
5771 {
5772 gimple *stmt = SSA_NAME_DEF_STMT (def);
5773 if (!last_stmt
5774 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5775 last_stmt = stmt;
5776 }
5777 }
5778 else
5779 {
5780 /* For externals we have to look at all defs since their
5781 insertion place is decided per vector. But beware
5782 of pre-existing vectors where we need to make sure
5783 we do not insert before the region boundary. */
5784 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5785 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5786 last_stmt = gsi_stmt (gsi_after_labels
5787 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5788 else
5789 {
5790 unsigned j;
5791 tree vdef;
5792 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5793 if (TREE_CODE (vdef) == SSA_NAME
5794 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5795 {
5796 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5797 if (!last_stmt
5798 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5799 last_stmt = vstmt;
5800 }
5801 }
5802 }
5803 /* This can happen when all children are pre-existing vectors or
5804 constants. */
5805 if (!last_stmt)
5806 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5807 if (is_a <gphi *> (last_stmt))
5808 si = gsi_after_labels (gimple_bb (last_stmt));
5809 else
5810 {
5811 si = gsi_for_stmt (last_stmt);
5812 gsi_next (&si);
5813 }
5814 }
5815
5816 bool done_p = false;
5817
5818 /* Handle purely internal nodes. */
5819 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5820 {
5821 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5822 be shared between different SLP nodes (but usually it's the same
5823 operation apart from the case where the stmt is only there to denote
5824 the actual scalar lane defs ...). So do not call vect_transform_stmt
5825 but open-code it here (partly). */
5826 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5827 gcc_assert (done);
5828 done_p = true;
5829 }
5830 if (!done_p)
5831 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5832 }
5833
5834 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5835 For loop vectorization this is done in vectorizable_call, but for SLP
5836 it needs to be deferred until end of vect_schedule_slp, because multiple
5837 SLP instances may refer to the same scalar stmt. */
5838
5839 static void
5840 vect_remove_slp_scalar_calls (vec_info *vinfo,
5841 slp_tree node, hash_set<slp_tree> &visited)
5842 {
5843 gimple *new_stmt;
5844 gimple_stmt_iterator gsi;
5845 int i;
5846 slp_tree child;
5847 tree lhs;
5848 stmt_vec_info stmt_info;
5849
5850 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5851 return;
5852
5853 if (visited.add (node))
5854 return;
5855
5856 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5857 vect_remove_slp_scalar_calls (vinfo, child, visited);
5858
5859 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5860 {
5861 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5862 if (!stmt || gimple_bb (stmt) == NULL)
5863 continue;
5864 if (is_pattern_stmt_p (stmt_info)
5865 || !PURE_SLP_STMT (stmt_info))
5866 continue;
5867 lhs = gimple_call_lhs (stmt);
5868 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5869 gsi = gsi_for_stmt (stmt);
5870 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5871 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5872 }
5873 }
5874
5875 static void
5876 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5877 {
5878 hash_set<slp_tree> visited;
5879 vect_remove_slp_scalar_calls (vinfo, node, visited);
5880 }
5881
5882 /* Vectorize the instance root. */
5883
5884 void
5885 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5886 {
5887 gassign *rstmt = NULL;
5888
5889 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5890 {
5891 gimple *child_stmt;
5892 int j;
5893
5894 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5895 {
5896 tree vect_lhs = gimple_get_lhs (child_stmt);
5897 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5898 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5899 TREE_TYPE (vect_lhs)))
5900 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5901 vect_lhs);
5902 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5903 break;
5904 }
5905 }
5906 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5907 {
5908 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5909 gimple *child_stmt;
5910 int j;
5911 vec<constructor_elt, va_gc> *v;
5912 vec_alloc (v, nelts);
5913
5914 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5915 {
5916 CONSTRUCTOR_APPEND_ELT (v,
5917 NULL_TREE,
5918 gimple_get_lhs (child_stmt));
5919 }
5920 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5921 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5922 tree r_constructor = build_constructor (rtype, v);
5923 rstmt = gimple_build_assign (lhs, r_constructor);
5924 }
5925
5926 gcc_assert (rstmt);
5927
5928 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5929 gsi_replace (&rgsi, rstmt, true);
5930 }
5931
5932 struct slp_scc_info
5933 {
5934 bool on_stack;
5935 int dfs;
5936 int lowlink;
5937 };
5938
5939 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
5940
5941 static void
5942 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5943 hash_map<slp_tree, slp_scc_info> &scc_info,
5944 int &maxdfs, vec<slp_tree> &stack)
5945 {
5946 bool existed_p;
5947 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5948 gcc_assert (!existed_p);
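   /* Standard Tarjan-style SCC bookkeeping: assign the DFS pre-order
      number and initialize the lowlink to it.  */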
5949 info->dfs = maxdfs;
5950 info->lowlink = maxdfs;
5951 maxdfs++;
5952
5953 /* Leaf. */
5954 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5955 {
5956 info->on_stack = false;
5957 vect_schedule_slp_node (vinfo, node, instance);
5958 return;
5959 }
5960
5961 info->on_stack = true;
5962 stack.safe_push (node);
5963
5964 unsigned i;
5965 slp_tree child;
5966 /* DFS recurse. */
5967 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5968 {
5969 if (!child)
5970 continue;
5971 slp_scc_info *child_info = scc_info.get (child);
5972 if (!child_info)
5973 {
5974 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5975 /* Recursion might have re-allocated the hash map entries, so re-fetch them. */
5976 info = scc_info.get (node);
5977 child_info = scc_info.get (child);
5978 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5979 }
5980 else if (child_info->on_stack)
5981 info->lowlink = MIN (info->lowlink, child_info->dfs);
5982 }
5983 if (info->lowlink != info->dfs)
5984 return;
5985
5986 auto_vec<slp_tree, 4> phis_to_fixup;
5987
5988 /* Singleton. */
5989 if (stack.last () == node)
5990 {
5991 stack.pop ();
5992 info->on_stack = false;
5993 vect_schedule_slp_node (vinfo, node, instance);
5994 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5995 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5996 phis_to_fixup.quick_push (node);
5997 }
5998 else
5999 {
6000 /* SCC. */
6001 int last_idx = stack.length () - 1;
6002 while (stack[last_idx] != node)
6003 last_idx--;
6004 /* We can break the cycle at PHIs which have at least one child
6005 code generated. Then we could re-start the DFS walk until
6006 all nodes in the SCC are covered (we might have new entries
6007 for only back-reachable nodes). But it's simpler to just
6008 iterate and schedule those that are ready. */
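       /* A PHI is ready when at least one of its children is missing or
          already scheduled, which provides the value to break the cycle;
          a non-PHI is ready only when none of its children remain on the
          stack.  */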
6009 unsigned todo = stack.length () - last_idx;
6010 do
6011 {
6012 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6013 {
6014 slp_tree entry = stack[idx];
6015 if (!entry)
6016 continue;
6017 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6018 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6019 bool ready = !phi;
6020 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6021 if (!child)
6022 {
6023 gcc_assert (phi);
6024 ready = true;
6025 break;
6026 }
6027 else if (scc_info.get (child)->on_stack)
6028 {
6029 if (!phi)
6030 {
6031 ready = false;
6032 break;
6033 }
6034 }
6035 else
6036 {
6037 if (phi)
6038 {
6039 ready = true;
6040 break;
6041 }
6042 }
6043 if (ready)
6044 {
6045 vect_schedule_slp_node (vinfo, entry, instance);
6046 scc_info.get (entry)->on_stack = false;
6047 stack[idx] = NULL;
6048 todo--;
6049 if (phi)
6050 phis_to_fixup.safe_push (entry);
6051 }
6052 }
6053 }
6054 while (todo != 0);
6055
6056 /* Pop the SCC. */
6057 stack.truncate (last_idx);
6058 }
6059
6060 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6061 slp_tree phi_node;
6062 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6063 {
6064 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6065 edge_iterator ei;
6066 edge e;
6067 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6068 {
6069 unsigned dest_idx = e->dest_idx;
6070 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6071 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6072 continue;
6073 /* Simply fill all args. */
6074 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6075 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6076 vect_get_slp_vect_def (child, i),
6077 e, gimple_phi_arg_location (phi, dest_idx));
6078 }
6079 }
6080 }
6081
6082 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6083
6084 void
6085 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6086 {
6087 slp_instance instance;
6088 unsigned int i;
6089
6090 hash_map<slp_tree, slp_scc_info> scc_info;
6091 int maxdfs = 0;
6092 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6093 {
6094 slp_tree node = SLP_INSTANCE_TREE (instance);
6095 if (dump_enabled_p ())
6096 {
6097 dump_printf_loc (MSG_NOTE, vect_location,
6098 "Vectorizing SLP tree:\n");
6099 if (SLP_INSTANCE_ROOT_STMT (instance))
6100 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6101 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6102 vect_print_slp_graph (MSG_NOTE, vect_location,
6103 SLP_INSTANCE_TREE (instance));
6104 }
6105 /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI
6106 is chosen as the node breaking the cycle. */
6107 auto_vec<slp_tree> stack;
6108 if (!scc_info.get (node))
6109 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6110
6111 if (SLP_INSTANCE_ROOT_STMT (instance))
6112 vectorize_slp_instance_root_stmt (node, instance);
6113
6114 if (dump_enabled_p ())
6115 dump_printf_loc (MSG_NOTE, vect_location,
6116 "vectorizing stmts using SLP.\n");
6117 }
6118
6119 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6120 {
6121 slp_tree root = SLP_INSTANCE_TREE (instance);
6122 stmt_vec_info store_info;
6123 unsigned int j;
6124
6125 /* Remove scalar call stmts. Do not do this for basic-block
6126 vectorization as not all uses may be vectorized.
6127 ??? Why should this be necessary? DCE should be able to
6128 remove the stmts itself.
6129 ??? For BB vectorization we can as well remove scalar
6130 stmts starting from the SLP tree root if they have no
6131 uses. */
6132 if (is_a <loop_vec_info> (vinfo))
6133 vect_remove_slp_scalar_calls (vinfo, root);
6134
6135 /* Remove the vectorized stores' original scalar stmts. */
6136 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6137 {
6138 if (!STMT_VINFO_DATA_REF (store_info)
6139 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6140 break;
6141
6142 store_info = vect_orig_stmt (store_info);
6143 /* Free the attached stmt_vec_info and remove the stmt. */
6144 vinfo->remove_stmt (store_info);
6145 }
6146 }
6147 }