Fix iteration over loads in SLP optimize
gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51
52 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
53 slp_tree, stmt_vector_for_cost *);
54
55 object_allocator<_slp_tree> *slp_tree_pool;
56
57 void *
58 _slp_tree::operator new (size_t n)
59 {
60 gcc_assert (n == sizeof (_slp_tree));
61 return slp_tree_pool->allocate_raw ();
62 }
63
64 void
65 _slp_tree::operator delete (void *node, size_t n)
66 {
67 gcc_assert (n == sizeof (_slp_tree));
68 slp_tree_pool->remove_raw (node);
69 }
70
71
72 /* Initialize a SLP node. */
73
74 _slp_tree::_slp_tree ()
75 {
76 SLP_TREE_SCALAR_STMTS (this) = vNULL;
77 SLP_TREE_SCALAR_OPS (this) = vNULL;
78 SLP_TREE_VEC_STMTS (this) = vNULL;
79 SLP_TREE_VEC_DEFS (this) = vNULL;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
85 SLP_TREE_CODE (this) = ERROR_MARK;
86 SLP_TREE_VECTYPE (this) = NULL_TREE;
87 SLP_TREE_REPRESENTATIVE (this) = NULL;
88 SLP_TREE_REF_COUNT (this) = 1;
89 this->max_nunits = 1;
90 this->lanes = 0;
91 }
92
93 /* Tear down a SLP node. */
94
95 _slp_tree::~_slp_tree ()
96 {
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
104 }
105
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
107
108 static void
109 vect_free_slp_tree (slp_tree node)
110 {
111 int i;
112 slp_tree child;
113
114 if (--SLP_TREE_REF_COUNT (node) != 0)
115 return;
116
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
118 if (child)
119 vect_free_slp_tree (child);
120
121 delete node;
122 }
123
124 /* Return a location suitable for dumps related to the SLP instance. */
125
126 dump_user_location_t
127 _slp_instance::location () const
128 {
129 if (root_stmt)
130 return root_stmt->stmt;
131 else
132 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
133 }
134
135
136 /* Free the memory allocated for the SLP instance. */
137
138 void
139 vect_free_slp_instance (slp_instance instance)
140 {
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
142 SLP_INSTANCE_LOADS (instance).release ();
143 instance->subgraph_entries.release ();
144 instance->cost_vec.release ();
145 free (instance);
146 }
147
148
149 /* Create an SLP node for SCALAR_STMTS. */
150
151 static slp_tree
152 vect_create_new_slp_node (slp_tree node,
153 vec<stmt_vec_info> scalar_stmts, unsigned nops)
154 {
155 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
156 SLP_TREE_CHILDREN (node).create (nops);
157 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
158 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
159 SLP_TREE_LANES (node) = scalar_stmts.length ();
160 return node;
161 }
162
163 /* Create an SLP node for SCALAR_STMTS. */
164
165 static slp_tree
166 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
167 {
168 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
169 }
170
171 /* Create an SLP node for OPS. */
172
173 static slp_tree
174 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
175 {
176 SLP_TREE_SCALAR_OPS (node) = ops;
177 SLP_TREE_DEF_TYPE (node) = vect_external_def;
178 SLP_TREE_LANES (node) = ops.length ();
179 return node;
180 }
181
182 /* Create an SLP node for OPS. */
183
184 static slp_tree
185 vect_create_new_slp_node (vec<tree> ops)
186 {
187 return vect_create_new_slp_node (new _slp_tree, ops);
188 }
189
190
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
193 node. */
194 typedef struct _slp_oprnd_info
195 {
196 /* Def-stmts for the operands. */
197 vec<stmt_vec_info> def_stmts;
198 /* Operands. */
199 vec<tree> ops;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
202 stmt. */
203 tree first_op_type;
204 enum vect_def_type first_dt;
205 bool any_pattern;
206 } *slp_oprnd_info;
207
208
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
210 operand. */
211 static vec<slp_oprnd_info>
212 vect_create_oprnd_info (int nops, int group_size)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216 vec<slp_oprnd_info> oprnds_info;
217
218 oprnds_info.create (nops);
219 for (i = 0; i < nops; i++)
220 {
221 oprnd_info = XNEW (struct _slp_oprnd_info);
222 oprnd_info->def_stmts.create (group_size);
223 oprnd_info->ops.create (group_size);
224 oprnd_info->first_dt = vect_uninitialized_def;
225 oprnd_info->first_op_type = NULL_TREE;
226 oprnd_info->any_pattern = false;
227 oprnds_info.quick_push (oprnd_info);
228 }
229
230 return oprnds_info;
231 }
232
233
234 /* Free operands info. */
235
236 static void
237 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
238 {
239 int i;
240 slp_oprnd_info oprnd_info;
241
242 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
243 {
244 oprnd_info->def_stmts.release ();
245 oprnd_info->ops.release ();
246 XDELETE (oprnd_info);
247 }
248
249 oprnds_info.release ();
250 }
251
252
253 /* Return true if STMTS contains a pattern statement. */
254
255 static bool
256 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
257 {
258 stmt_vec_info stmt_info;
259 unsigned int i;
260 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
261 if (is_pattern_stmt_p (stmt_info))
262 return true;
263 return false;
264 }
265
266 /* Return true when all lanes in the external or constant NODE have
267 the same value. */
268
269 static bool
270 vect_slp_tree_uniform_p (slp_tree node)
271 {
272 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
274
275 /* Pre-existing vectors. */
276 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
277 return false;
278
279 unsigned i;
280 tree op, first = NULL_TREE;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
282 if (!first)
283 first = op;
284 else if (!operand_equal_p (first, op, 0))
285 return false;
286
287 return true;
288 }
289
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
292 of the chain. */
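/* For example, given an interleaving chain that loads a[0], a[2] and a[3]
   (the names here are purely illustrative), the places returned for those
   three stmts are 0, 2 and 3 respectively, the gap after a[0] being
   counted towards the position.  */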
293
294 int
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
296 stmt_vec_info first_stmt_info)
297 {
298 stmt_vec_info next_stmt_info = first_stmt_info;
299 int result = 0;
300
301 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
302 return -1;
303
304 do
305 {
306 if (next_stmt_info == stmt_info)
307 return result;
308 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
309 if (next_stmt_info)
310 result += DR_GROUP_GAP (next_stmt_info);
311 }
312 while (next_stmt_info);
313
314 return -1;
315 }
316
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
321 (if nonnull). */
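/* As an illustration (the exact modes depend on the target): for COUNT == 4
   "char" elements and 16-byte vectors the first attempt is to fuse each
   group of four chars into one 32-bit integer, i.e. to use a single V4SI
   intermediate vector; if the permutes needed for that are not supported,
   pairs of chars are fused into 16-bit integers using two intermediate
   vectors instead, and so on.  */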
322
323 bool
324 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
325 tree elt_type, unsigned int *nvectors_out,
326 tree *vector_type_out,
327 tree *permutes)
328 {
329 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
330 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
331 return false;
332
333 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
334 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
335 unsigned int nvectors = 1;
336 for (;;)
337 {
338 scalar_int_mode int_mode;
339 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
340 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
341 {
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type = build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode), 1);
345 tree vector_type
346 = get_vectype_for_scalar_type (vinfo, int_type, count);
347 if (vector_type
348 && VECTOR_MODE_P (TYPE_MODE (vector_type))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
350 GET_MODE_SIZE (base_vector_mode)))
351 {
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
356 vec_perm_builder sel1 (nelts, 2, 3);
357 vec_perm_builder sel2 (nelts, 2, 3);
358 poly_int64 half_nelts = exact_div (nelts, 2);
359 for (unsigned int i = 0; i < 3; ++i)
360 {
361 sel1.quick_push (i);
362 sel1.quick_push (i + nelts);
363 sel2.quick_push (half_nelts + i);
364 sel2.quick_push (half_nelts + i + nelts);
365 }
366 vec_perm_indices indices1 (sel1, 2, nelts);
367 vec_perm_indices indices2 (sel2, 2, nelts);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
370 {
371 if (nvectors_out)
372 *nvectors_out = nvectors;
373 if (vector_type_out)
374 *vector_type_out = vector_type;
375 if (permutes)
376 {
377 permutes[0] = vect_gen_perm_mask_checked (vector_type,
378 indices1);
379 permutes[1] = vect_gen_perm_mask_checked (vector_type,
380 indices2);
381 }
382 return true;
383 }
384 }
385 }
386 if (!multiple_p (elt_bytes, 2, &elt_bytes))
387 return false;
388 nvectors *= 2;
389 }
390 }
391
392 /* Return true if DTA and DTB match. */
393
394 static bool
395 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
396 {
397 return (dta == dtb
398 || ((dta == vect_external_def || dta == vect_constant_def)
399 && (dtb == vect_external_def || dtb == vect_constant_def)));
400 }
401
402 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
403 they are of a valid type and that they match the defs of the first stmt of
404 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
405 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
406 indicates swap is required for cond_expr stmts. Specifically, *SWAP
407 is 1 if STMT is cond and operands of comparison need to be swapped;
408 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
409 If there is any operand swap in this function, *SWAP is set to a
410 non-zero value.
411 If there was a fatal error return -1; if the error could be corrected by
412 swapping operands of the parent node of this one, return 1; if everything
413 is ok return 0. */
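/* For example, for the (hypothetical) stmt
     x_1 = b_2 < c_3 ? d_4 : e_5;
   SWAP == 1 collects the operands as c_3, b_2, d_4, e_5 (comparison operands
   swapped), while SWAP == 2 collects b_2, c_3, e_5, d_4 (the arms swapped,
   corresponding to the inverted comparison code).  */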
414 static int
415 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
416 vec<stmt_vec_info> stmts, unsigned stmt_num,
417 vec<slp_oprnd_info> *oprnds_info)
418 {
419 stmt_vec_info stmt_info = stmts[stmt_num];
420 tree oprnd;
421 unsigned int i, number_of_oprnds;
422 enum vect_def_type dt = vect_uninitialized_def;
423 slp_oprnd_info oprnd_info;
424 int first_op_idx = 1;
425 unsigned int commutative_op = -1U;
426 bool first_op_cond = false;
427 bool first = stmt_num == 0;
428
429 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
430 {
431 number_of_oprnds = gimple_call_num_args (stmt);
432 first_op_idx = 3;
433 if (gimple_call_internal_p (stmt))
434 {
435 internal_fn ifn = gimple_call_internal_fn (stmt);
436 commutative_op = first_commutative_argument (ifn);
437
438 /* Masked load, only look at mask. */
439 if (ifn == IFN_MASK_LOAD)
440 {
441 number_of_oprnds = 1;
442 /* Mask operand index. */
443 first_op_idx = 5;
444 }
445 }
446 }
447 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
448 {
449 enum tree_code code = gimple_assign_rhs_code (stmt);
450 number_of_oprnds = gimple_num_ops (stmt) - 1;
451 /* Swap can only be done for cond_expr if asked to, otherwise we
452 could result in different comparison code to the first stmt. */
453 if (code == COND_EXPR
454 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
455 {
456 first_op_cond = true;
457 number_of_oprnds++;
458 }
459 else
460 commutative_op = commutative_tree_code (code) ? 0U : -1U;
461 }
462 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
463 number_of_oprnds = gimple_phi_num_args (stmt);
464 else
465 return -1;
466
467 bool swapped = (swap != 0);
468 bool backedge = false;
469 gcc_assert (!swapped || first_op_cond);
470 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
471 for (i = 0; i < number_of_oprnds; i++)
472 {
473 if (first_op_cond)
474 {
475 /* Map indicating how operands of cond_expr should be swapped. */
476 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
477 int *map = maps[swap];
478
479 if (i < 2)
480 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
481 first_op_idx), map[i]);
482 else
483 oprnd = gimple_op (stmt_info->stmt, map[i]);
484 }
485 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
486 {
487 oprnd = gimple_phi_arg_def (stmt, i);
488 backedge = dominated_by_p (CDI_DOMINATORS,
489 gimple_phi_arg_edge (stmt, i)->src,
490 gimple_bb (stmt_info->stmt));
491 }
492 else
493 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
494 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
495 oprnd = TREE_OPERAND (oprnd, 0);
496
497 oprnd_info = (*oprnds_info)[i];
498
499 stmt_vec_info def_stmt_info;
500 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
501 {
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
504 "Build SLP failed: can't analyze def for %T\n",
505 oprnd);
506
507 return -1;
508 }
509
510 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
511 oprnd_info->any_pattern = true;
512
513 oprnd_info->def_stmts.quick_push (def_stmt_info);
514 oprnd_info->ops.quick_push (oprnd);
515
516 /* If there's an extern def on a backedge make sure we can
517 code-generate at the region start.
518 ??? This is another case that could be fixed by adjusting
519 how we split the function but at the moment we'd have conflicting
520 goals there. */
521 if (backedge
522 && dts[i] == vect_external_def
523 && is_a <bb_vec_info> (vinfo)
524 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
525 && !dominated_by_p (CDI_DOMINATORS,
526 as_a <bb_vec_info> (vinfo)->bbs[0],
527 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
528 {
529 if (dump_enabled_p ())
530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
531 "Build SLP failed: extern def %T only defined "
532 "on backedge\n", oprnd);
533 return -1;
534 }
535
536 if (first)
537 {
538 tree type = TREE_TYPE (oprnd);
539 dt = dts[i];
540 if ((dt == vect_constant_def
541 || dt == vect_external_def)
542 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
543 && (TREE_CODE (type) == BOOLEAN_TYPE
544 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
545 type)))
546 {
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
549 "Build SLP failed: invalid type of def "
550 "for variable-length SLP %T\n", oprnd);
551 return -1;
552 }
553
554 /* For the swapping logic below force vect_reduction_def
555 for the reduction op in a SLP reduction group. */
556 if (!STMT_VINFO_DATA_REF (stmt_info)
557 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
558 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
559 && def_stmt_info)
560 dts[i] = dt = vect_reduction_def;
561
562 /* Check the types of the definition. */
563 switch (dt)
564 {
565 case vect_external_def:
566 case vect_constant_def:
567 case vect_internal_def:
568 case vect_reduction_def:
569 case vect_induction_def:
570 case vect_nested_cycle:
571 break;
572
573 default:
574 /* FORNOW: Not supported. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "Build SLP failed: illegal type of def %T\n",
578 oprnd);
579 return -1;
580 }
581
582 oprnd_info->first_dt = dt;
583 oprnd_info->first_op_type = type;
584 }
585 }
586 if (first)
587 return 0;
588
589 /* Now match the operand definition types to that of the first stmt. */
590 for (i = 0; i < number_of_oprnds;)
591 {
592 oprnd_info = (*oprnds_info)[i];
593 dt = dts[i];
594 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
595 oprnd = oprnd_info->ops[stmt_num];
596 tree type = TREE_TYPE (oprnd);
597
598 if (!types_compatible_p (oprnd_info->first_op_type, type))
599 {
600 if (dump_enabled_p ())
601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
602 "Build SLP failed: different operand types\n");
603 return 1;
604 }
605
606 /* Not first stmt of the group, check that the def-stmt/s match
607 the def-stmt/s of the first stmt. Allow different definition
608 types for reduction chains: the first stmt must be a
609 vect_reduction_def (a phi node), and the rest
610 end in the reduction chain. */
611 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
612 && !(oprnd_info->first_dt == vect_reduction_def
613 && !STMT_VINFO_DATA_REF (stmt_info)
614 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
615 && def_stmt_info
616 && !STMT_VINFO_DATA_REF (def_stmt_info)
617 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
618 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
619 || (!STMT_VINFO_DATA_REF (stmt_info)
620 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
621 && ((!def_stmt_info
622 || STMT_VINFO_DATA_REF (def_stmt_info)
623 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
624 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
625 != (oprnd_info->first_dt != vect_reduction_def))))
626 {
627 /* Try swapping operands if we got a mismatch. For BB
628 vectorization only in case it will clearly improve things. */
629 if (i == commutative_op && !swapped
630 && (!is_a <bb_vec_info> (vinfo)
631 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
632 dts[i+1])
633 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
634 || vect_def_types_match
635 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
636 {
637 if (dump_enabled_p ())
638 dump_printf_loc (MSG_NOTE, vect_location,
639 "trying swapped operands\n");
640 std::swap (dts[i], dts[i+1]);
641 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
642 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
643 std::swap ((*oprnds_info)[i]->ops[stmt_num],
644 (*oprnds_info)[i+1]->ops[stmt_num]);
645 swapped = true;
646 continue;
647 }
648
649 if (is_a <bb_vec_info> (vinfo)
650 && !oprnd_info->any_pattern)
651 {
652 /* Now for commutative ops we should see whether we can
653 make the other operand match. */
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
656 "treating operand as external\n");
657 oprnd_info->first_dt = dt = vect_external_def;
658 }
659 else
660 {
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 "Build SLP failed: different types\n");
664 return 1;
665 }
666 }
667
668 /* Make sure to demote the overall operand to external. */
669 if (dt == vect_external_def)
670 oprnd_info->first_dt = vect_external_def;
671 /* For a SLP reduction chain we want to duplicate the reduction to
672 each of the chain members. That gets us a sane SLP graph (still
673 the stmts are not 100% correct wrt the initial values). */
674 else if ((dt == vect_internal_def
675 || dt == vect_reduction_def)
676 && oprnd_info->first_dt == vect_reduction_def
677 && !STMT_VINFO_DATA_REF (stmt_info)
678 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
679 && !STMT_VINFO_DATA_REF (def_stmt_info)
680 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
681 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
682 {
683 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
684 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
685 }
686
687 ++i;
688 }
689
690 /* Swap operands. */
691 if (swapped)
692 {
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "swapped operands to match def types in %G",
696 stmt_info->stmt);
697 }
698
699 return 0;
700 }
701
702 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
703 Return true if we can, meaning that this choice doesn't conflict with
704 existing SLP nodes that use STMT_INFO. */
705
706 bool
707 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
708 {
709 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
710 if (old_vectype)
711 return useless_type_conversion_p (vectype, old_vectype);
712
713 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
714 {
715 /* We maintain the invariant that if any statement in the group is
716 used, all other members of the group have the same vector type. */
717 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
718 stmt_vec_info member_info = first_info;
719 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
720 if (is_pattern_stmt_p (member_info)
721 && !useless_type_conversion_p (vectype,
722 STMT_VINFO_VECTYPE (member_info)))
723 break;
724
725 if (!member_info)
726 {
727 for (member_info = first_info; member_info;
728 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
729 STMT_VINFO_VECTYPE (member_info) = vectype;
730 return true;
731 }
732 }
733 else if (!is_pattern_stmt_p (stmt_info))
734 {
735 STMT_VINFO_VECTYPE (stmt_info) = vectype;
736 return true;
737 }
738
739 if (dump_enabled_p ())
740 {
741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
742 "Build SLP failed: incompatible vector"
743 " types for: %G", stmt_info->stmt);
744 dump_printf_loc (MSG_NOTE, vect_location,
745 " old vector type: %T\n", old_vectype);
746 dump_printf_loc (MSG_NOTE, vect_location,
747 " new vector type: %T\n", vectype);
748 }
749 return false;
750 }
751
752 /* Return true if call statements CALL1 and CALL2 are similar enough
753 to be combined into the same SLP group. */
754
755 static bool
756 compatible_calls_p (gcall *call1, gcall *call2)
757 {
758 unsigned int nargs = gimple_call_num_args (call1);
759 if (nargs != gimple_call_num_args (call2))
760 return false;
761
762 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
763 return false;
764
765 if (gimple_call_internal_p (call1))
766 {
767 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
768 TREE_TYPE (gimple_call_lhs (call2))))
769 return false;
770 for (unsigned int i = 0; i < nargs; ++i)
771 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
772 TREE_TYPE (gimple_call_arg (call2, i))))
773 return false;
774 }
775 else
776 {
777 if (!operand_equal_p (gimple_call_fn (call1),
778 gimple_call_fn (call2), 0))
779 return false;
780
781 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
782 return false;
783 }
784 return true;
785 }
786
787 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
788 caller's attempt to find the vector type in STMT_INFO with the narrowest
789 element type. Return true if VECTYPE is nonnull and if it is valid
790 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
791 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
792 vect_build_slp_tree. */
793
794 static bool
795 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
796 unsigned int group_size,
797 tree vectype, poly_uint64 *max_nunits)
798 {
799 if (!vectype)
800 {
801 if (dump_enabled_p ())
802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
803 "Build SLP failed: unsupported data-type in %G\n",
804 stmt_info->stmt);
805 /* Fatal mismatch. */
806 return false;
807 }
808
809 /* If populating the vector type requires unrolling then fail
810 before adjusting *max_nunits for basic-block vectorization. */
811 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
812 unsigned HOST_WIDE_INT const_nunits;
813 if (is_a <bb_vec_info> (vinfo)
814 && (!nunits.is_constant (&const_nunits)
815 || const_nunits > group_size))
816 {
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: unrolling required "
820 "in basic block SLP\n");
821 /* Fatal mismatch. */
822 return false;
823 }
824
825 /* In case of multiple types we need to detect the smallest type. */
826 vect_update_max_nunits (max_nunits, vectype);
827 return true;
828 }
829
830 /* Verify whether the scalar stmts STMTS are isomorphic to each other,
831 do not require data permutation and are not of unsupported types of
832 operation. Return true if so, otherwise return false and indicate in *MATCHES
833 which stmts are not isomorphic to the first one. If MATCHES[0]
834 is false then this indicates the comparison could not be
835 carried out or the stmts will never be vectorized by SLP.
836
837 Note COND_EXPR is possibly isomorphic to another one after swapping its
838 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
839 the first stmt by swapping the two operands of comparison; set SWAP[i]
840 to 2 if stmt I is isomorphic to the first stmt by inverting the code
841 of comparison. Take A1 >= B1 ? X1 : Y1 as an example; it can be swapped
842 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
843
844 static bool
845 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
846 vec<stmt_vec_info> stmts, unsigned int group_size,
847 poly_uint64 *max_nunits, bool *matches,
848 bool *two_operators, tree *node_vectype)
849 {
850 unsigned int i;
851 stmt_vec_info first_stmt_info = stmts[0];
852 enum tree_code first_stmt_code = ERROR_MARK;
853 enum tree_code alt_stmt_code = ERROR_MARK;
854 enum tree_code rhs_code = ERROR_MARK;
855 enum tree_code first_cond_code = ERROR_MARK;
856 tree lhs;
857 bool need_same_oprnds = false;
858 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
859 optab optab;
860 int icode;
861 machine_mode optab_op2_mode;
862 machine_mode vec_mode;
863 stmt_vec_info first_load = NULL, prev_first_load = NULL;
864 bool first_stmt_load_p = false, load_p = false;
865 bool first_stmt_phi_p = false, phi_p = false;
866
867 /* For every stmt in NODE find its def stmt/s. */
868 stmt_vec_info stmt_info;
869 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
870 {
871 gimple *stmt = stmt_info->stmt;
872 swap[i] = 0;
873 matches[i] = false;
874
875 if (dump_enabled_p ())
876 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
877
878 /* Fail to vectorize statements marked as unvectorizable, statements
879 that can throw internally or statements with volatile operands. */
880 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
881 || stmt_can_throw_internal (cfun, stmt)
882 || gimple_has_volatile_ops (stmt))
883 {
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
886 "Build SLP failed: unvectorizable statement %G",
887 stmt);
888 /* ??? For BB vectorization we want to commute operands so as to
889 shuffle all unvectorizable defs into one operand and have
890 the other still vectorized. The following doesn't reliably
891 work for this, but it's the easiest we can do here. */
892 if (is_a <bb_vec_info> (vinfo) && i != 0)
893 continue;
894 /* Fatal mismatch. */
895 matches[0] = false;
896 return false;
897 }
898
899 lhs = gimple_get_lhs (stmt);
900 if (lhs == NULL_TREE)
901 {
902 if (dump_enabled_p ())
903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
904 "Build SLP failed: not GIMPLE_ASSIGN nor "
905 "GIMPLE_CALL %G", stmt);
906 if (is_a <bb_vec_info> (vinfo) && i != 0)
907 continue;
908 /* Fatal mismatch. */
909 matches[0] = false;
910 return false;
911 }
912
913 tree nunits_vectype;
914 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
915 &nunits_vectype, group_size)
916 || (nunits_vectype
917 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
918 nunits_vectype, max_nunits)))
919 {
920 if (is_a <bb_vec_info> (vinfo) && i != 0)
921 continue;
922 /* Fatal mismatch. */
923 matches[0] = false;
924 return false;
925 }
926
927 gcc_assert (vectype);
928
929 gcall *call_stmt = dyn_cast <gcall *> (stmt);
930 if (call_stmt)
931 {
932 rhs_code = CALL_EXPR;
933
934 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
935 load_p = true;
936 else if ((gimple_call_internal_p (call_stmt)
937 && (!vectorizable_internal_fn_p
938 (gimple_call_internal_fn (call_stmt))))
939 || gimple_call_tail_p (call_stmt)
940 || gimple_call_noreturn_p (call_stmt)
941 || !gimple_call_nothrow_p (call_stmt)
942 || gimple_call_chain (call_stmt))
943 {
944 if (dump_enabled_p ())
945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
946 "Build SLP failed: unsupported call type %G",
947 call_stmt);
948 if (is_a <bb_vec_info> (vinfo) && i != 0)
949 continue;
950 /* Fatal mismatch. */
951 matches[0] = false;
952 return false;
953 }
954 }
955 else if (gimple_code (stmt) == GIMPLE_PHI)
956 {
957 rhs_code = ERROR_MARK;
958 phi_p = true;
959 }
960 else
961 {
962 rhs_code = gimple_assign_rhs_code (stmt);
963 load_p = gimple_vuse (stmt);
964 }
965
966 /* Check the operation. */
967 if (i == 0)
968 {
969 *node_vectype = vectype;
970 first_stmt_code = rhs_code;
971 first_stmt_load_p = load_p;
972 first_stmt_phi_p = phi_p;
973
974 /* Shift arguments should be equal in all the packed stmts for a
975 vector shift with scalar shift operand. */
976 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
977 || rhs_code == LROTATE_EXPR
978 || rhs_code == RROTATE_EXPR)
979 {
980 vec_mode = TYPE_MODE (vectype);
981
982 /* First see if we have a vector/vector shift. */
983 optab = optab_for_tree_code (rhs_code, vectype,
984 optab_vector);
985
986 if (!optab
987 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
988 {
989 /* No vector/vector shift, try for a vector/scalar shift. */
990 optab = optab_for_tree_code (rhs_code, vectype,
991 optab_scalar);
992
993 if (!optab)
994 {
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
997 "Build SLP failed: no optab.\n");
998 if (is_a <bb_vec_info> (vinfo) && i != 0)
999 continue;
1000 /* Fatal mismatch. */
1001 matches[0] = false;
1002 return false;
1003 }
1004 icode = (int) optab_handler (optab, vec_mode);
1005 if (icode == CODE_FOR_nothing)
1006 {
1007 if (dump_enabled_p ())
1008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1009 "Build SLP failed: "
1010 "op not supported by target.\n");
1011 if (is_a <bb_vec_info> (vinfo) && i != 0)
1012 continue;
1013 /* Fatal mismatch. */
1014 matches[0] = false;
1015 return false;
1016 }
1017 optab_op2_mode = insn_data[icode].operand[2].mode;
1018 if (!VECTOR_MODE_P (optab_op2_mode))
1019 {
1020 need_same_oprnds = true;
1021 first_op1 = gimple_assign_rhs2 (stmt);
1022 }
1023 }
1024 }
1025 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1026 {
1027 need_same_oprnds = true;
1028 first_op1 = gimple_assign_rhs2 (stmt);
1029 }
1030 else if (!load_p
1031 && rhs_code == BIT_FIELD_REF)
1032 {
1033 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1034 if (TREE_CODE (vec) != SSA_NAME
1035 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1036 {
1037 if (is_a <bb_vec_info> (vinfo) && i != 0)
1038 continue;
1039 if (dump_enabled_p ())
1040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1041 "Build SLP failed: "
1042 "BIT_FIELD_REF not supported\n");
1043 /* Fatal mismatch. */
1044 matches[0] = false;
1045 return false;
1046 }
1047 }
1048 else if (call_stmt
1049 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1050 {
1051 need_same_oprnds = true;
1052 first_op1 = gimple_call_arg (call_stmt, 1);
1053 }
1054 }
1055 else
1056 {
1057 if (first_stmt_code != rhs_code
1058 && alt_stmt_code == ERROR_MARK)
1059 alt_stmt_code = rhs_code;
1060 if ((first_stmt_code != rhs_code
1061 && (first_stmt_code != IMAGPART_EXPR
1062 || rhs_code != REALPART_EXPR)
1063 && (first_stmt_code != REALPART_EXPR
1064 || rhs_code != IMAGPART_EXPR)
1065 /* Handle mismatches in plus/minus by computing both
1066 and merging the results. */
1067 && !((first_stmt_code == PLUS_EXPR
1068 || first_stmt_code == MINUS_EXPR)
1069 && (alt_stmt_code == PLUS_EXPR
1070 || alt_stmt_code == MINUS_EXPR)
1071 && rhs_code == alt_stmt_code)
1072 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1073 && (first_stmt_code == ARRAY_REF
1074 || first_stmt_code == BIT_FIELD_REF
1075 || first_stmt_code == INDIRECT_REF
1076 || first_stmt_code == COMPONENT_REF
1077 || first_stmt_code == MEM_REF)))
1078 || first_stmt_load_p != load_p
1079 || first_stmt_phi_p != phi_p)
1080 {
1081 if (dump_enabled_p ())
1082 {
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "Build SLP failed: different operation "
1085 "in stmt %G", stmt);
1086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1087 "original stmt %G", first_stmt_info->stmt);
1088 }
1089 /* Mismatch. */
1090 continue;
1091 }
1092
1093 if (need_same_oprnds)
1094 {
1095 tree other_op1 = (call_stmt
1096 ? gimple_call_arg (call_stmt, 1)
1097 : gimple_assign_rhs2 (stmt));
1098 if (!operand_equal_p (first_op1, other_op1, 0))
1099 {
1100 if (dump_enabled_p ())
1101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1102 "Build SLP failed: different shift "
1103 "arguments in %G", stmt);
1104 /* Mismatch. */
1105 continue;
1106 }
1107 }
1108 if (!load_p
1109 && first_stmt_code == BIT_FIELD_REF
1110 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1111 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1112 {
1113 if (dump_enabled_p ())
1114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1115 "Build SLP failed: different BIT_FIELD_REF "
1116 "arguments in %G", stmt);
1117 /* Mismatch. */
1118 continue;
1119 }
1120
1121 if (!load_p && rhs_code == CALL_EXPR)
1122 {
1123 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1124 as_a <gcall *> (stmt)))
1125 {
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "Build SLP failed: different calls in %G",
1129 stmt);
1130 /* Mismatch. */
1131 continue;
1132 }
1133 }
1134
1135 if (phi_p
1136 && (gimple_bb (first_stmt_info->stmt)
1137 != gimple_bb (stmt_info->stmt)))
1138 {
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "Build SLP failed: different BB for PHI "
1142 "in %G", stmt);
1143 /* Mismatch. */
1144 continue;
1145 }
1146
1147 if (!types_compatible_p (vectype, *node_vectype))
1148 {
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "Build SLP failed: different vector type "
1152 "in %G", stmt);
1153 /* Mismatch. */
1154 continue;
1155 }
1156 }
1157
1158 /* Grouped store or load. */
1159 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1160 {
1161 if (REFERENCE_CLASS_P (lhs))
1162 {
1163 /* Store. */
1164 ;
1165 }
1166 else
1167 {
1168 /* Load. */
1169 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1170 if (prev_first_load)
1171 {
1172 /* Check that there are no loads from different interleaving
1173 chains in the same node. */
1174 if (prev_first_load != first_load)
1175 {
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1178 vect_location,
1179 "Build SLP failed: different "
1180 "interleaving chains in one node %G",
1181 stmt);
1182 /* Mismatch. */
1183 continue;
1184 }
1185 }
1186 else
1187 prev_first_load = first_load;
1188 }
1189 } /* Grouped access. */
1190 else
1191 {
1192 if (load_p)
1193 {
1194 /* Not grouped load. */
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "Build SLP failed: not grouped load %G", stmt);
1198
1199 /* FORNOW: Not grouped loads are not supported. */
1200 if (is_a <bb_vec_info> (vinfo) && i != 0)
1201 continue;
1202 /* Fatal mismatch. */
1203 matches[0] = false;
1204 return false;
1205 }
1206
1207 /* Not memory operation. */
1208 if (!phi_p
1209 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1210 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1211 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1212 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1213 && rhs_code != VIEW_CONVERT_EXPR
1214 && rhs_code != CALL_EXPR
1215 && rhs_code != BIT_FIELD_REF)
1216 {
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "Build SLP failed: operation unsupported %G",
1220 stmt);
1221 if (is_a <bb_vec_info> (vinfo) && i != 0)
1222 continue;
1223 /* Fatal mismatch. */
1224 matches[0] = false;
1225 return false;
1226 }
1227
1228 if (rhs_code == COND_EXPR)
1229 {
1230 tree cond_expr = gimple_assign_rhs1 (stmt);
1231 enum tree_code cond_code = TREE_CODE (cond_expr);
1232 enum tree_code swap_code = ERROR_MARK;
1233 enum tree_code invert_code = ERROR_MARK;
1234
1235 if (i == 0)
1236 first_cond_code = TREE_CODE (cond_expr);
1237 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1238 {
1239 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1240 swap_code = swap_tree_comparison (cond_code);
1241 invert_code = invert_tree_comparison (cond_code, honor_nans);
1242 }
1243
1244 if (first_cond_code == cond_code)
1245 ;
1246 /* Isomorphic can be achieved by swapping. */
1247 else if (first_cond_code == swap_code)
1248 swap[i] = 1;
1249 /* Isomorphic can be achieved by inverting. */
1250 else if (first_cond_code == invert_code)
1251 swap[i] = 2;
1252 else
1253 {
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "Build SLP failed: different"
1257 " operation %G", stmt);
1258 /* Mismatch. */
1259 continue;
1260 }
1261 }
1262 }
1263
1264 matches[i] = true;
1265 }
1266
1267 for (i = 0; i < group_size; ++i)
1268 if (!matches[i])
1269 return false;
1270
1271 /* If we allowed a two-operation SLP node verify the target can cope
1272 with the permute we are going to use. */
1273 if (alt_stmt_code != ERROR_MARK
1274 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1275 {
1276 *two_operators = true;
1277 }
1278
1279 return true;
1280 }
1281
1282 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1283 Note we never remove apart from at destruction time so we do not
1284 need a special value for deleted that differs from empty. */
1285 struct bst_traits
1286 {
1287 typedef vec <stmt_vec_info> value_type;
1288 typedef vec <stmt_vec_info> compare_type;
1289 static inline hashval_t hash (value_type);
1290 static inline bool equal (value_type existing, value_type candidate);
1291 static inline bool is_empty (value_type x) { return !x.exists (); }
1292 static inline bool is_deleted (value_type x) { return !x.exists (); }
1293 static const bool empty_zero_p = true;
1294 static inline void mark_empty (value_type &x) { x.release (); }
1295 static inline void mark_deleted (value_type &x) { x.release (); }
1296 static inline void remove (value_type &x) { x.release (); }
1297 };
1298 inline hashval_t
1299 bst_traits::hash (value_type x)
1300 {
1301 inchash::hash h;
1302 for (unsigned i = 0; i < x.length (); ++i)
1303 h.add_int (gimple_uid (x[i]->stmt));
1304 return h.end ();
1305 }
1306 inline bool
1307 bst_traits::equal (value_type existing, value_type candidate)
1308 {
1309 if (existing.length () != candidate.length ())
1310 return false;
1311 for (unsigned i = 0; i < existing.length (); ++i)
1312 if (existing[i] != candidate[i])
1313 return false;
1314 return true;
1315 }
1316
1317 typedef hash_map <vec <gimple *>, slp_tree,
1318 simple_hashmap_traits <bst_traits, slp_tree> >
1319 scalar_stmts_to_slp_tree_map_t;
1320
1321 static slp_tree
1322 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1323 vec<stmt_vec_info> stmts, unsigned int group_size,
1324 poly_uint64 *max_nunits,
1325 bool *matches, unsigned *npermutes, unsigned *tree_size,
1326 scalar_stmts_to_slp_tree_map_t *bst_map);
1327
1328 static slp_tree
1329 vect_build_slp_tree (vec_info *vinfo,
1330 vec<stmt_vec_info> stmts, unsigned int group_size,
1331 poly_uint64 *max_nunits,
1332 bool *matches, unsigned *npermutes, unsigned *tree_size,
1333 scalar_stmts_to_slp_tree_map_t *bst_map)
1334 {
1335 if (slp_tree *leader = bst_map->get (stmts))
1336 {
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1339 *leader ? "" : "failed ", *leader);
1340 if (*leader)
1341 {
1342 SLP_TREE_REF_COUNT (*leader)++;
1343 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1344 }
1345 return *leader;
1346 }
1347
1348 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1349 so we can pick up backedge destinations during discovery. */
1350 slp_tree res = new _slp_tree;
1351 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1352 SLP_TREE_SCALAR_STMTS (res) = stmts;
1353 bst_map->put (stmts.copy (), res);
1354
1355 poly_uint64 this_max_nunits = 1;
1356 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1357 &this_max_nunits,
1358 matches, npermutes, tree_size, bst_map);
1359 if (!res_)
1360 {
1361 bool existed_p = bst_map->put (stmts, NULL);
1362 gcc_assert (existed_p);
1363 /* Mark the node invalid so we can detect those when still in use
1364 as backedge destinations. */
1365 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1366 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1367 vect_free_slp_tree (res);
1368 }
1369 else
1370 {
1371 gcc_assert (res_ == res);
1372 res->max_nunits = this_max_nunits;
1373 vect_update_max_nunits (max_nunits, this_max_nunits);
1374 /* Keep a reference for the bst_map use. */
1375 SLP_TREE_REF_COUNT (res)++;
1376 }
1377 return res_;
1378 }
1379
1380 /* Recursively build an SLP tree starting from NODE.
1381 Fail (and return NULL) if def-stmts are not
1382 isomorphic, require data permutation or are of unsupported types of
1383 operation; in that case *MATCHES indicates which stmts failed to
1384 match the first one. Otherwise, return the SLP tree
1385 built for the group of stmts. */
1386
1387 static slp_tree
1388 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1389 vec<stmt_vec_info> stmts, unsigned int group_size,
1390 poly_uint64 *max_nunits,
1391 bool *matches, unsigned *npermutes, unsigned *tree_size,
1392 scalar_stmts_to_slp_tree_map_t *bst_map)
1393 {
1394 unsigned nops, i, this_tree_size = 0;
1395 poly_uint64 this_max_nunits = *max_nunits;
1396
1397 matches[0] = false;
1398
1399 stmt_vec_info stmt_info = stmts[0];
1400 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1401 nops = gimple_call_num_args (stmt);
1402 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1403 {
1404 nops = gimple_num_ops (stmt) - 1;
1405 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1406 nops++;
1407 }
1408 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1409 nops = gimple_phi_num_args (phi);
1410 else
1411 return NULL;
1412
1413 /* If the SLP node is a PHI (induction or reduction), terminate
1414 the recursion. */
1415 bool skip_args[2] = { false, false };
1416 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1417 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1418 {
1419 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1420 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1421 group_size);
1422 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1423 max_nunits))
1424 return NULL;
1425
1426 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1427 /* Induction from different IVs is not supported. */
1428 if (def_type == vect_induction_def)
1429 {
1430 stmt_vec_info other_info;
1431 FOR_EACH_VEC_ELT (stmts, i, other_info)
1432 if (stmt_info != other_info)
1433 return NULL;
1434
1435 /* Induction PHIs are leaves. */
1436 (*tree_size)++;
1437 node = vect_create_new_slp_node (node, stmts, nops);
1438 SLP_TREE_VECTYPE (node) = vectype;
1439 SLP_TREE_CHILDREN (node).quick_grow_cleared (nops);
1440 return node;
1441 }
1442 else if (def_type == vect_reduction_def
1443 || def_type == vect_double_reduction_def
1444 || def_type == vect_nested_cycle)
1445 {
1446 /* Else def types have to match. */
1447 stmt_vec_info other_info;
1448 bool all_same = true;
1449 FOR_EACH_VEC_ELT (stmts, i, other_info)
1450 {
1451 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1452 return NULL;
1453 if (other_info != stmt_info)
1454 all_same = false;
1455 }
1456 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1457 /* Reduction initial values are not explicitly represented. */
1458 if (!nested_in_vect_loop_p (loop, stmt_info))
1459 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1460 /* Reduction chain backedge defs are filled manually.
1461 ??? Need a better way to identify a SLP reduction chain PHI.
1462 Or a better overall way to SLP match those. */
1463 if (all_same && def_type == vect_reduction_def)
1464 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1465 }
1466 else if (def_type != vect_internal_def)
1467 return NULL;
1468 }
1469
1470
1471 bool two_operators = false;
1472 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1473 tree vectype = NULL_TREE;
1474 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1475 &this_max_nunits, matches, &two_operators,
1476 &vectype))
1477 return NULL;
1478
1479 /* If the SLP node is a load, terminate the recursion unless masked. */
1480 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1481 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1482 {
1483 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1484 {
1485 /* Masked load. */
1486 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1487 nops = 1;
1488 }
1489 else
1490 {
1491 *max_nunits = this_max_nunits;
1492 (*tree_size)++;
1493 node = vect_create_new_slp_node (node, stmts, 0);
1494 SLP_TREE_VECTYPE (node) = vectype;
1495 /* And compute the load permutation. Whether it is actually
1496 a permutation depends on the unrolling factor which is
1497 decided later. */
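/* E.g. for a node whose scalar stmts load a[1], a[0], a[3] and a[2]
   from the group starting at a[0] (names illustrative only) the
   recorded load permutation is { 1, 0, 3, 2 }.  */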
1498 vec<unsigned> load_permutation;
1499 int j;
1500 stmt_vec_info load_info;
1501 load_permutation.create (group_size);
1502 stmt_vec_info first_stmt_info
1503 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1504 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1505 {
1506 int load_place = vect_get_place_in_interleaving_chain
1507 (load_info, first_stmt_info);
1508 gcc_assert (load_place != -1);
1509 load_permutation.safe_push (load_place);
1510 }
1511 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1512 return node;
1513 }
1514 }
1515 else if (gimple_assign_single_p (stmt_info->stmt)
1516 && !gimple_vuse (stmt_info->stmt)
1517 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1518 {
1519 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1520 the same SSA name vector of a compatible type to vectype. */
1521 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1522 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1523 stmt_vec_info estmt_info;
1524 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1525 {
1526 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1527 tree bfref = gimple_assign_rhs1 (estmt);
1528 HOST_WIDE_INT lane;
1529 if (!known_eq (bit_field_size (bfref),
1530 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1531 || !constant_multiple_p (bit_field_offset (bfref),
1532 bit_field_size (bfref), &lane))
1533 {
1534 lperm.release ();
1535 return NULL;
1536 }
1537 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1538 }
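/* For example, with 32-bit vector elements a BIT_FIELD_REF <v_1, 32, 64>
   selects lane 2, so the pair (0, 2) is recorded for that stmt
   (v_1 stands for the common source vector).  */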
1539 slp_tree vnode = vect_create_new_slp_node (vNULL);
1540 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1541 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1542 /* We are always building a permutation node even if it is an identity
1543 permute to shield the rest of the vectorizer from the odd node
1544 representing an actual vector without any scalar ops.
1545 ??? We could hide it completely by making the permute node
1546 external? */
1547 node = vect_create_new_slp_node (node, stmts, 1);
1548 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1549 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1550 SLP_TREE_VECTYPE (node) = vectype;
1551 SLP_TREE_CHILDREN (node).quick_push (vnode);
1552 return node;
1553 }
1554
1555 /* Get at the operands, verifying they are compatible. */
1556 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1557 slp_oprnd_info oprnd_info;
1558 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1559 {
1560 int res = vect_get_and_check_slp_defs (vinfo, swap[i],
1561 stmts, i, &oprnds_info);
1562 if (res != 0)
1563 matches[(res == -1) ? 0 : i] = false;
1564 if (!matches[0])
1565 break;
1566 }
1567 for (i = 0; i < group_size; ++i)
1568 if (!matches[i])
1569 {
1570 vect_free_oprnd_info (oprnds_info);
1571 return NULL;
1572 }
1573 swap = NULL;
1574
1575 auto_vec<slp_tree, 4> children;
1576
1577 stmt_info = stmts[0];
1578
1579 /* Create SLP_TREE nodes for the definition node/s. */
1580 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1581 {
1582 slp_tree child;
1583 unsigned int j;
1584
1585 if (oprnd_info->first_dt == vect_uninitialized_def)
1586 {
1587 /* COND_EXPRs end up with one operand too many here if the condition
1588 is an SSA name. */
1589 gcc_assert (i == 3 && nops == 4);
1590 continue;
1591 }
1592
1593 /* We're skipping certain operands from processing, for example
1594 outer loop reduction initial defs. */
1595 if (i <= 1 && skip_args[i])
1596 {
1597 children.safe_push (NULL);
1598 continue;
1599 }
1600
1601 if (is_a <bb_vec_info> (vinfo)
1602 && oprnd_info->first_dt == vect_internal_def)
1603 {
1604 /* For BB vectorization, if all defs are the same do not
1605 bother to continue the build along the single-lane
1606 graph but use a splat of the scalar value. */
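/* E.g. if every lane uses the same def x_1 (and x_1 is not a load),
   the operand is turned into an external def below and code-generated
   as a splat of x_1 rather than as a single-lane SLP subgraph.  */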
1607 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1608 for (j = 1; j < group_size; ++j)
1609 if (oprnd_info->def_stmts[j] != first_def)
1610 break;
1611 if (j == group_size
1612 /* But avoid doing this for loads where we may be
1613 able to CSE things. */
1614 && !gimple_vuse (first_def->stmt))
1615 {
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_NOTE, vect_location,
1618 "Using a splat of the uniform operand\n");
1619 oprnd_info->first_dt = vect_external_def;
1620 }
1621 }
1622
1623 if (oprnd_info->first_dt == vect_external_def
1624 || oprnd_info->first_dt == vect_constant_def)
1625 {
1626 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1627 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1628 oprnd_info->ops = vNULL;
1629 children.safe_push (invnode);
1630 continue;
1631 }
1632
1633 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1634 group_size, &this_max_nunits,
1635 matches, npermutes,
1636 &this_tree_size, bst_map)) != NULL)
1637 {
1638 oprnd_info->def_stmts = vNULL;
1639 children.safe_push (child);
1640 continue;
1641 }
1642
1643 /* If the SLP build for operand zero failed and operand zero
1644 and one can be commutated try that for the scalar stmts
1645 that failed the match. */
1646 if (i == 0
1647 /* A first scalar stmt mismatch signals a fatal mismatch. */
1648 && matches[0]
1649 /* ??? For COND_EXPRs we can swap the comparison operands
1650 as well as the arms under some constraints. */
1651 && nops == 2
1652 && oprnds_info[1]->first_dt == vect_internal_def
1653 && is_gimple_assign (stmt_info->stmt)
1654 /* Swapping operands for reductions breaks assumptions later on. */
1655 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1656 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1657 /* Do so only if the number of unsuccessful permutes was not more
1658 than a cut-off, as re-trying the recursive match on
1659 possibly each level of the tree would expose exponential
1660 behavior. */
1661 && *npermutes < 4)
1662 {
1663 /* See whether we can swap the matching or the non-matching
1664 stmt operands. */
1665 bool swap_not_matching = true;
1666 do
1667 {
1668 for (j = 0; j < group_size; ++j)
1669 {
1670 if (matches[j] != !swap_not_matching)
1671 continue;
1672 stmt_vec_info stmt_info = stmts[j];
1673 /* Verify if we can swap operands of this stmt. */
1674 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1675 if (!stmt
1676 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1677 {
1678 if (!swap_not_matching)
1679 goto fail;
1680 swap_not_matching = false;
1681 break;
1682 }
1683 }
1684 }
1685 while (j != group_size);
1686
1687 /* Swap mismatched definition stmts. */
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_NOTE, vect_location,
1690 "Re-trying with swapped operands of stmts ");
1691 for (j = 0; j < group_size; ++j)
1692 if (matches[j] == !swap_not_matching)
1693 {
1694 std::swap (oprnds_info[0]->def_stmts[j],
1695 oprnds_info[1]->def_stmts[j]);
1696 std::swap (oprnds_info[0]->ops[j],
1697 oprnds_info[1]->ops[j]);
1698 if (dump_enabled_p ())
1699 dump_printf (MSG_NOTE, "%d ", j);
1700 }
1701 if (dump_enabled_p ())
1702 dump_printf (MSG_NOTE, "\n");
1703 /* And try again with scratch 'matches' ... */
1704 bool *tem = XALLOCAVEC (bool, group_size);
1705 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1706 group_size, &this_max_nunits,
1707 tem, npermutes,
1708 &this_tree_size, bst_map)) != NULL)
1709 {
1710 oprnd_info->def_stmts = vNULL;
1711 children.safe_push (child);
1712 continue;
1713 }
1714 /* We do not undo the swapping here since it might still be
1715 the better order for the second operand in case we build
1716 the first one from scalars below. */
1717 ++*npermutes;
1718 }
1719 fail:
1720
1721 /* If the SLP build failed and we analyze a basic-block
1722 simply treat nodes we fail to build as externally defined
1723 (and thus build vectors from the scalar defs).
1724 The cost model will reject outright expensive cases.
1725 ??? This doesn't treat cases where permutation ultimately
1726 fails (or we don't try permutation below). Ideally we'd
1727 even compute a permutation that will end up with the maximum
1728 SLP tree size... */
1729 if (is_a <bb_vec_info> (vinfo)
1730 /* ??? Rejecting patterns this way doesn't work. We'd have to
1731 do extra work to cancel the pattern so the uses see the
1732 scalar version. */
1733 && !is_pattern_stmt_p (stmt_info)
1734 && !oprnd_info->any_pattern)
1735 {
1736 /* But if there's a leading vector sized set of matching stmts
1737 fail here so we can split the group. This matches the condition
1738 vect_analyze_slp_instance uses. */
1739 /* ??? We might want to split here and combine the results to support
1740 multiple vector sizes better. */
1741 for (j = 0; j < group_size; ++j)
1742 if (!matches[j])
1743 break;
1744 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1745 {
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "Building vector operands from scalars\n");
1749 this_tree_size++;
1750 child = vect_create_new_slp_node (oprnd_info->ops);
1751 children.safe_push (child);
1752 oprnd_info->ops = vNULL;
1753 continue;
1754 }
1755 }
1756
1757 gcc_assert (child == NULL);
1758 FOR_EACH_VEC_ELT (children, j, child)
1759 if (child)
1760 vect_free_slp_tree (child);
1761 vect_free_oprnd_info (oprnds_info);
1762 return NULL;
1763 }
1764
1765 vect_free_oprnd_info (oprnds_info);
1766
1767 /* If all children of this node are built up from uniform scalars, or if
1768 more than one child requires a possibly expensive vector construction,
1769 then just throw the node away, causing it to be built up from scalars
1770 instead. The exception is the SLP node for the vector store. */
1771 if (is_a <bb_vec_info> (vinfo)
1772 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1773 /* ??? Rejecting patterns this way doesn't work. We'd have to
1774 do extra work to cancel the pattern so the uses see the
1775 scalar version. */
1776 && !is_pattern_stmt_p (stmt_info))
1777 {
1778 slp_tree child;
1779 unsigned j;
1780 bool all_uniform_p = true;
1781 unsigned n_vector_builds = 0;
1782 FOR_EACH_VEC_ELT (children, j, child)
1783 {
1784 if (!child)
1785 ;
1786 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1787 all_uniform_p = false;
1788 else if (!vect_slp_tree_uniform_p (child))
1789 {
1790 all_uniform_p = false;
1791 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1792 n_vector_builds++;
1793 }
1794 }
1795 if (all_uniform_p || n_vector_builds > 1)
1796 {
1797 /* Roll back. */
1798 matches[0] = false;
1799 FOR_EACH_VEC_ELT (children, j, child)
1800 if (child)
1801 vect_free_slp_tree (child);
1802
1803 if (dump_enabled_p ())
1804 dump_printf_loc (MSG_NOTE, vect_location,
1805 "Building parent vector operands from "
1806 "scalars instead\n");
1807 return NULL;
1808 }
1809 }
1810
1811 *tree_size += this_tree_size + 1;
1812 *max_nunits = this_max_nunits;
1813
1814 if (two_operators)
1815 {
1816 /* ??? We'd likely want to either cache in bst_map something like
1817 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1818 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1819 explicit stmts to put in so the keying on 'stmts' doesn't
1820 work (but we have the same issue with nodes that use 'ops'). */
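/* For example, for the group { a+b, a-b, a+b, a-b } node ONE below gets
   code PLUS_EXPR, node TWO gets MINUS_EXPR and the VEC_PERM parent
   records the lane permutation { (0,0), (1,1), (0,2), (1,3) }, selecting
   the even lanes from ONE and the odd lanes from TWO.  */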
1821 slp_tree one = new _slp_tree;
1822 slp_tree two = new _slp_tree;
1823 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1824 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1825 SLP_TREE_VECTYPE (one) = vectype;
1826 SLP_TREE_VECTYPE (two) = vectype;
1827 SLP_TREE_CHILDREN (one).safe_splice (children);
1828 SLP_TREE_CHILDREN (two).safe_splice (children);
1829 slp_tree child;
1830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1831 SLP_TREE_REF_COUNT (child)++;
1832
1833 /* Here we record the original defs since this
1834 node represents the final lane configuration. */
1835 node = vect_create_new_slp_node (node, stmts, 2);
1836 SLP_TREE_VECTYPE (node) = vectype;
1837 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1838 SLP_TREE_CHILDREN (node).quick_push (one);
1839 SLP_TREE_CHILDREN (node).quick_push (two);
1840 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1841 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1842 enum tree_code ocode = ERROR_MARK;
1843 stmt_vec_info ostmt_info;
1844 unsigned j = 0;
1845 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1846 {
1847 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1848 if (gimple_assign_rhs_code (ostmt) != code0)
1849 {
1850 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1851 ocode = gimple_assign_rhs_code (ostmt);
1852 j = i;
1853 }
1854 else
1855 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1856 }
1857 SLP_TREE_CODE (one) = code0;
1858 SLP_TREE_CODE (two) = ocode;
1859 SLP_TREE_LANES (one) = stmts.length ();
1860 SLP_TREE_LANES (two) = stmts.length ();
1861 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1862 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1863 return node;
1864 }
1865
1866 node = vect_create_new_slp_node (node, stmts, nops);
1867 SLP_TREE_VECTYPE (node) = vectype;
1868 SLP_TREE_CHILDREN (node).splice (children);
1869 return node;
1870 }
1871
1872 /* Dump a single SLP tree NODE. */
1873
1874 static void
1875 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1876 slp_tree node)
1877 {
1878 unsigned i, j;
1879 slp_tree child;
1880 stmt_vec_info stmt_info;
1881 tree op;
1882
1883 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1884 dump_user_location_t user_loc = loc.get_user_location ();
1885 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1886 SLP_TREE_DEF_TYPE (node) == vect_external_def
1887 ? " (external)"
1888 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1889 ? " (constant)"
1890 : ""), node,
1891 estimated_poly_value (node->max_nunits),
1892 SLP_TREE_REF_COUNT (node));
1893 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1894 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1895 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1896 else
1897 {
1898 dump_printf_loc (metadata, user_loc, "\t{ ");
1899 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1900 dump_printf (metadata, "%T%s ", op,
1901 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1902 dump_printf (metadata, "}\n");
1903 }
1904 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1905 {
1906 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1907 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1908 dump_printf (dump_kind, " %u", j);
1909 dump_printf (dump_kind, " }\n");
1910 }
1911 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1912 {
1913 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1914 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1915 dump_printf (dump_kind, " %u[%u]",
1916 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1917 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1918 dump_printf (dump_kind, " }\n");
1919 }
1920 if (SLP_TREE_CHILDREN (node).is_empty ())
1921 return;
1922 dump_printf_loc (metadata, user_loc, "\tchildren");
1923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1924 dump_printf (dump_kind, " %p", (void *)child);
1925 dump_printf (dump_kind, "\n");
1926 }
1927
1928 DEBUG_FUNCTION void
1929 debug (slp_tree node)
1930 {
1931 debug_dump_context ctx;
1932 vect_print_slp_tree (MSG_NOTE,
1933 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1934 node);
1935 }
1936
1937 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1938
1939 static void
1940 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1941 slp_tree node, hash_set<slp_tree> &visited)
1942 {
1943 unsigned i;
1944 slp_tree child;
1945
1946 if (visited.add (node))
1947 return;
1948
1949 vect_print_slp_tree (dump_kind, loc, node);
1950
1951 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1952 if (child)
1953 vect_print_slp_graph (dump_kind, loc, child, visited);
1954 }
1955
1956 static void
1957 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1958 slp_tree entry)
1959 {
1960 hash_set<slp_tree> visited;
1961 vect_print_slp_graph (dump_kind, loc, entry, visited);
1962 }
1963
1964 /* Mark the tree rooted at NODE with PURE_SLP. */
1965
1966 static void
1967 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1968 {
1969 int i;
1970 stmt_vec_info stmt_info;
1971 slp_tree child;
1972
1973 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1974 return;
1975
1976 if (visited.add (node))
1977 return;
1978
1979 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1980 STMT_SLP_TYPE (stmt_info) = pure_slp;
1981
1982 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1983 if (child)
1984 vect_mark_slp_stmts (child, visited);
1985 }
1986
1987 static void
1988 vect_mark_slp_stmts (slp_tree node)
1989 {
1990 hash_set<slp_tree> visited;
1991 vect_mark_slp_stmts (node, visited);
1992 }
1993
1994 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1995
1996 static void
1997 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1998 {
1999 int i;
2000 stmt_vec_info stmt_info;
2001 slp_tree child;
2002
2003 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2004 return;
2005
2006 if (visited.add (node))
2007 return;
2008
2009 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2010 {
2011 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2012 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2013 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2014 }
2015
2016 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2017 if (child)
2018 vect_mark_slp_stmts_relevant (child, visited);
2019 }
2020
2021 static void
2022 vect_mark_slp_stmts_relevant (slp_tree node)
2023 {
2024 hash_set<slp_tree> visited;
2025 vect_mark_slp_stmts_relevant (node, visited);
2026 }
2027
2028
2029 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2030
2031 static void
2032 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2033 hash_set<slp_tree> &visited)
2034 {
2035 if (!node || visited.add (node))
2036 return;
2037
2038 if (SLP_TREE_CHILDREN (node).length () == 0)
2039 {
2040 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2041 return;
2042 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2043 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2044 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2045 loads.safe_push (node);
2046 }
2047 else
2048 {
2049 unsigned i;
2050 slp_tree child;
2051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2052 vect_gather_slp_loads (loads, child, visited);
2053 }
2054 }
2055
2056 static void
2057 vect_gather_slp_loads (slp_instance inst, slp_tree node)
2058 {
2059 hash_set<slp_tree> visited;
2060 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2061 }
2062
2063
2064 /* Find the last scalar stmt in NODE. */
2065
2066 stmt_vec_info
2067 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2068 {
2069 stmt_vec_info last = NULL;
2070 stmt_vec_info stmt_vinfo;
2071
2072 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2073 {
2074 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2075 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2076 }
2077
2078 return last;
2079 }
2080
2081 /* Find the first stmt in NODE. */
2082
2083 stmt_vec_info
2084 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2085 {
2086 stmt_vec_info first = NULL;
2087 stmt_vec_info stmt_vinfo;
2088
2089 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2090 {
2091 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2092 if (!first
2093 || get_later_stmt (stmt_vinfo, first) == first)
2094 first = stmt_vinfo;
2095 }
2096
2097 return first;
2098 }
2099
2100 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2101 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2102 (also containing the first GROUP1_SIZE stmts, since stores are
2103 consecutive), the second containing the remainder.
2104 Return the first stmt in the second group. */
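/* For example, splitting a gap-free group of 8 stores with GROUP1_SIZE == 4
   leaves two groups of 4; the gap of the first group becomes 4 to skip over
   the second group and the gap of the second group becomes 4 to skip over the
   first, so both groups still cover the original 8-element stride.  */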
2105
2106 static stmt_vec_info
2107 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2108 {
2109 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2110 gcc_assert (group1_size > 0);
2111 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2112 gcc_assert (group2_size > 0);
2113 DR_GROUP_SIZE (first_vinfo) = group1_size;
2114
2115 stmt_vec_info stmt_info = first_vinfo;
2116 for (unsigned i = group1_size; i > 1; i--)
2117 {
2118 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2119 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2120 }
2121 /* STMT is now the last element of the first group. */
2122 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2123 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2124
2125 DR_GROUP_SIZE (group2) = group2_size;
2126 for (stmt_info = group2; stmt_info;
2127 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2128 {
2129 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2130 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2131 }
2132
2133 /* For the second group, the DR_GROUP_GAP is that before the original group,
2134 plus skipping over the first vector. */
2135 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2136
2137 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2138 DR_GROUP_GAP (first_vinfo) += group2_size;
2139
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2142 group1_size, group2_size);
2143
2144 return group2;
2145 }
2146
2147 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2148 statements and a vector of NUNITS elements. */
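/* For example, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple is 12
   and the unrolling factor is 2, while a GROUP_SIZE that is a multiple of
   NUNITS yields an unrolling factor of 1.  */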
2149
2150 static poly_uint64
2151 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2152 {
2153 return exact_div (common_multiple (nunits, group_size), group_size);
2154 }
2155
2156 enum slp_instance_kind {
2157 slp_inst_kind_store,
2158 slp_inst_kind_reduc_group,
2159 slp_inst_kind_reduc_chain,
2160 slp_inst_kind_ctor
2161 };
2162
2163 static bool
2164 vect_analyze_slp_instance (vec_info *vinfo,
2165 scalar_stmts_to_slp_tree_map_t *bst_map,
2166 stmt_vec_info stmt_info, unsigned max_tree_size);
2167
2168 /* Analyze an SLP instance starting from SCALAR_STMTS, which form a group
2169 of kind KIND. Return true if successful. */
2170
2171 static bool
2172 vect_build_slp_instance (vec_info *vinfo,
2173 slp_instance_kind kind,
2174 vec<stmt_vec_info> scalar_stmts,
2175 stmt_vec_info root_stmt_info,
2176 unsigned max_tree_size,
2177 scalar_stmts_to_slp_tree_map_t *bst_map,
2178 /* ??? We need stmt_info for group splitting. */
2179 stmt_vec_info stmt_info_)
2180 {
2181 if (dump_enabled_p ())
2182 {
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Starting SLP discovery for\n");
2185 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2186 dump_printf_loc (MSG_NOTE, vect_location,
2187 " %G", scalar_stmts[i]->stmt);
2188 }
2189
2190 /* Build the tree for the SLP instance. */
2191 unsigned int group_size = scalar_stmts.length ();
2192 bool *matches = XALLOCAVEC (bool, group_size);
2193 unsigned npermutes = 0;
2194 poly_uint64 max_nunits = 1;
2195 unsigned tree_size = 0;
2196 unsigned i;
2197 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2198 &max_nunits, matches, &npermutes,
2199 &tree_size, bst_map);
2200 if (node != NULL)
2201 {
2202 /* Calculate the unrolling factor based on the smallest type. */
2203 poly_uint64 unrolling_factor
2204 = calculate_unrolling_factor (max_nunits, group_size);
2205
2206 if (maybe_ne (unrolling_factor, 1U)
2207 && is_a <bb_vec_info> (vinfo))
2208 {
2209 unsigned HOST_WIDE_INT const_max_nunits;
2210 if (!max_nunits.is_constant (&const_max_nunits)
2211 || const_max_nunits > group_size)
2212 {
2213 if (dump_enabled_p ())
2214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2215 "Build SLP failed: store group "
2216 "size not a multiple of the vector size "
2217 "in basic block SLP\n");
2218 vect_free_slp_tree (node);
2219 return false;
2220 }
2221 /* Fatal mismatch. */
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "SLP discovery succeeded but node needs "
2225 "splitting\n");
2226 memset (matches, true, group_size);
2227 matches[group_size / const_max_nunits * const_max_nunits] = false;
2228 vect_free_slp_tree (node);
2229 }
2230 else
2231 {
2232 /* Create a new SLP instance. */
2233 slp_instance new_instance = XNEW (class _slp_instance);
2234 SLP_INSTANCE_TREE (new_instance) = node;
2235 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2236 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2237 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2238 new_instance->reduc_phis = NULL;
2239 new_instance->cost_vec = vNULL;
2240 new_instance->subgraph_entries = vNULL;
2241
2242 vect_gather_slp_loads (new_instance, node);
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE, vect_location,
2245 "SLP size %u vs. limit %u.\n",
2246 tree_size, max_tree_size);
2247
2248 /* Check whether any load is possibly permuted. */
2249 slp_tree load_node;
2250 bool loads_permuted = false;
2251 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2252 {
2253 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2254 continue;
2255 unsigned j;
2256 stmt_vec_info load_info;
2257 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2258 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2259 {
2260 loads_permuted = true;
2261 break;
2262 }
2263 }
2264
2265 /* If the loads and stores can be handled with load/store-lane
2266 instructions do not generate this SLP instance. */
2267 if (is_a <loop_vec_info> (vinfo)
2268 && loads_permuted
2269 && kind == slp_inst_kind_store
2270 && vect_store_lanes_supported
2271 (STMT_VINFO_VECTYPE (scalar_stmts[0]), group_size, false))
2272 {
2273 slp_tree load_node;
2274 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2275 {
2276 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2277 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2278 /* Use SLP for strided accesses (or if we can't load-lanes). */
2279 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2280 || ! vect_load_lanes_supported
2281 (STMT_VINFO_VECTYPE (stmt_vinfo),
2282 DR_GROUP_SIZE (stmt_vinfo), false))
2283 break;
2284 }
2285 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2286 {
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "Built SLP cancelled: can use "
2290 "load/store-lanes\n");
2291 vect_free_slp_instance (new_instance);
2292 return false;
2293 }
2294 }
2295
2296 /* Fixup SLP reduction chains. */
2297 if (kind == slp_inst_kind_reduc_chain)
2298 {
2299 /* If this is a reduction chain with a conversion in front
2300 amend the SLP tree with a node for that. */
2301 gimple *scalar_def
2302 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2303 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2304 {
2305 /* Get at the conversion stmt - we know it's the single use
2306 of the last stmt of the reduction chain. */
2307 use_operand_p use_p;
2308 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2309 &use_p, &scalar_def);
2310 gcc_assert (r);
2311 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2312 next_info = vect_stmt_to_vectorize (next_info);
2313 scalar_stmts = vNULL;
2314 scalar_stmts.create (group_size);
2315 for (unsigned i = 0; i < group_size; ++i)
2316 scalar_stmts.quick_push (next_info);
2317 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2318 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2319 SLP_TREE_CHILDREN (conv).quick_push (node);
2320 SLP_INSTANCE_TREE (new_instance) = conv;
2321 /* We also have to fake this conversion stmt as SLP reduction
2322 group so we don't have to mess with too much code
2323 elsewhere. */
2324 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2325 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2326 }
2327 /* Fill the backedge child of the PHI SLP node. The
2328 general matching code cannot find it because the
2329 scalar code does not reflect how we vectorize the
2330 reduction. */
2331 use_operand_p use_p;
2332 imm_use_iterator imm_iter;
2333 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2335 gimple_get_lhs (scalar_def))
2336 /* There are exactly two non-debug uses, the reduction
2337 PHI and the loop-closed PHI node. */
2338 if (!is_gimple_debug (USE_STMT (use_p))
2339 && gimple_bb (USE_STMT (use_p)) == loop->header)
2340 {
2341 auto_vec<stmt_vec_info, 64> phis (group_size);
2342 stmt_vec_info phi_info
2343 = vinfo->lookup_stmt (USE_STMT (use_p));
2344 for (unsigned i = 0; i < group_size; ++i)
2345 phis.quick_push (phi_info);
2346 slp_tree *phi_node = bst_map->get (phis);
2347 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2348 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2349 = SLP_INSTANCE_TREE (new_instance);
2350 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2351 }
2352 }
2353
2354 vinfo->slp_instances.safe_push (new_instance);
2355
2356 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2357 the number of scalar stmts in the root in a few places.
2358 Verify that assumption holds. */
2359 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2360 .length () == group_size);
2361
2362 if (dump_enabled_p ())
2363 {
2364 dump_printf_loc (MSG_NOTE, vect_location,
2365 "Final SLP tree for instance:\n");
2366 vect_print_slp_graph (MSG_NOTE, vect_location,
2367 SLP_INSTANCE_TREE (new_instance));
2368 }
2369
2370 return true;
2371 }
2372 }
2373 else
2374 {
2375 /* Failed to SLP. */
2376 /* Free the allocated memory. */
2377 scalar_stmts.release ();
2378 }
2379
2380 stmt_vec_info stmt_info = stmt_info_;
2381 /* Try to break the group up into pieces. */
2382 if (kind == slp_inst_kind_store)
2383 {
2384 /* ??? We could delay all the actual splitting of store-groups
2385 until after SLP discovery of the original group completed.
2386 Then we can recurse to vect_build_slp_instance directly. */
2387 for (i = 0; i < group_size; i++)
2388 if (!matches[i])
2389 break;
2390
2391 /* For basic block SLP, try to break the group up into multiples of
2392 a vector size. */
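/* For example, if discovery failed at stmt I == 6 and the chosen vector type
   holds 4 elements, GROUP1_SIZE below becomes 6 & ~3 == 4 and the group is
   split after the first four stores.  */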
2393 if (is_a <bb_vec_info> (vinfo)
2394 && (i > 1 && i < group_size))
2395 {
2396 tree scalar_type
2397 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2398 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2399 1 << floor_log2 (i));
2400 unsigned HOST_WIDE_INT const_nunits;
2401 if (vectype
2402 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2403 {
2404 /* Split into two groups at the first vector boundary. */
2405 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2406 unsigned group1_size = i & ~(const_nunits - 1);
2407
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_NOTE, vect_location,
2410 "Splitting SLP group at stmt %u\n", i);
2411 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2412 group1_size);
2413 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2414 max_tree_size);
2415 /* Split the rest at the failure point and possibly
2416 re-analyze the remaining matching part if it has
2417 at least two lanes. */
2418 if (group1_size < i
2419 && (i + 1 < group_size
2420 || i - group1_size > 1))
2421 {
2422 stmt_vec_info rest2 = rest;
2423 rest = vect_split_slp_store_group (rest, i - group1_size);
2424 if (i - group1_size > 1)
2425 res |= vect_analyze_slp_instance (vinfo, bst_map,
2426 rest2, max_tree_size);
2427 }
2428 /* Re-analyze the non-matching tail if it has at least
2429 two lanes. */
2430 if (i + 1 < group_size)
2431 res |= vect_analyze_slp_instance (vinfo, bst_map,
2432 rest, max_tree_size);
2433 return res;
2434 }
2435 }
2436
2437 /* For loop vectorization split into arbitrary pieces of size > 1. */
2438 if (is_a <loop_vec_info> (vinfo)
2439 && (i > 1 && i < group_size))
2440 {
2441 unsigned group1_size = i;
2442
2443 if (dump_enabled_p ())
2444 dump_printf_loc (MSG_NOTE, vect_location,
2445 "Splitting SLP group at stmt %u\n", i);
2446
2447 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2448 group1_size);
2449 /* Loop vectorization cannot handle gaps in stores, make sure
2450 the split group appears as strided. */
2451 STMT_VINFO_STRIDED_P (rest) = 1;
2452 DR_GROUP_GAP (rest) = 0;
2453 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2454 DR_GROUP_GAP (stmt_info) = 0;
2455
2456 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2457 max_tree_size);
2458 if (i + 1 < group_size)
2459 res |= vect_analyze_slp_instance (vinfo, bst_map,
2460 rest, max_tree_size);
2461
2462 return res;
2463 }
2464
2465 /* Even though the first vector did not all match, we might be able to SLP
2466 (some) of the remainder. FORNOW ignore this possibility. */
2467 }
2468
2469 /* Failed to SLP. */
2470 if (dump_enabled_p ())
2471 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2472 return false;
2473 }
2474
2475
2476 /* Analyze an SLP instance starting from a group of grouped stores. Call
2477 vect_build_slp_tree to build a tree of packed stmts if possible.
2478 Return FALSE if it's impossible to SLP any stmt in the loop. */
2479
2480 static bool
2481 vect_analyze_slp_instance (vec_info *vinfo,
2482 scalar_stmts_to_slp_tree_map_t *bst_map,
2483 stmt_vec_info stmt_info, unsigned max_tree_size)
2484 {
2485 unsigned int group_size;
2486 unsigned int i;
2487 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2488 vec<stmt_vec_info> scalar_stmts;
2489 slp_instance_kind kind;
2490
2491 if (is_a <bb_vec_info> (vinfo))
2492 vect_location = stmt_info->stmt;
2493 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2494 {
2495 kind = slp_inst_kind_store;
2496 group_size = DR_GROUP_SIZE (stmt_info);
2497 }
2498 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2499 {
2500 kind = slp_inst_kind_reduc_chain;
2501 gcc_assert (is_a <loop_vec_info> (vinfo));
2502 group_size = REDUC_GROUP_SIZE (stmt_info);
2503 }
2504 else if (is_gimple_assign (stmt_info->stmt)
2505 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2506 {
2507 kind = slp_inst_kind_ctor;
2508 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2509 }
2510 else
2511 {
2512 kind = slp_inst_kind_reduc_group;
2513 gcc_assert (is_a <loop_vec_info> (vinfo));
2514 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2515 }
2516
2517 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2518 scalar_stmts.create (group_size);
2519 stmt_vec_info next_info = stmt_info;
2520 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2521 {
2522 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2523 while (next_info)
2524 {
2525 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2526 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2527 }
2528 }
2529 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2530 {
2531 /* Collect the reduction stmts and store them in
2532 SLP_TREE_SCALAR_STMTS. */
2533 while (next_info)
2534 {
2535 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2536 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2537 }
2538 /* Mark the first element of the reduction chain as reduction to properly
2539 transform the node. In the reduction analysis phase only the last
2540 element of the chain is marked as reduction. */
2541 STMT_VINFO_DEF_TYPE (stmt_info)
2542 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2543 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2544 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2545 }
2546 else if (kind == slp_inst_kind_ctor)
2547 {
2548 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2549 tree val;
2550 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2551 {
2552 if (TREE_CODE (val) == SSA_NAME)
2553 {
2554 gimple* def = SSA_NAME_DEF_STMT (val);
2555 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2556 /* Value is defined in another basic block. */
2557 if (!def_info)
2558 return false;
2559 def_info = vect_stmt_to_vectorize (def_info);
2560 scalar_stmts.safe_push (def_info);
2561 }
2562 else
2563 return false;
2564 }
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_NOTE, vect_location,
2567 "Analyzing vectorizable constructor: %G\n",
2568 stmt_info->stmt);
2569 }
2570 else
2571 {
2572 /* Collect reduction statements. */
2573 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2574 for (i = 0; reductions.iterate (i, &next_info); i++)
2575 scalar_stmts.safe_push (next_info);
2576 }
2577
2578 /* Build the tree for the SLP instance. */
2579 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2580 kind == slp_inst_kind_ctor
2581 ? stmt_info : NULL,
2582 max_tree_size,
2583 bst_map, stmt_info);
2584
2585 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2586 where we should do store group splitting. */
2587
2588 return res;
2589 }
2590
2591 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2592 trees of packed scalar stmts if SLP is possible. */
2593
2594 opt_result
2595 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2596 {
2597 unsigned int i;
2598 stmt_vec_info first_element;
2599
2600 DUMP_VECT_SCOPE ("vect_analyze_slp");
2601
2602 scalar_stmts_to_slp_tree_map_t *bst_map
2603 = new scalar_stmts_to_slp_tree_map_t ();
2604
2605 /* Find SLP sequences starting from groups of grouped stores. */
2606 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2607 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2608
2609 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2610 {
2611 if (loop_vinfo->reduction_chains.length () > 0)
2612 {
2613 /* Find SLP sequences starting from reduction chains. */
2614 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2615 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2616 max_tree_size))
2617 {
2618 /* Dissolve reduction chain group. */
2619 stmt_vec_info vinfo = first_element;
2620 stmt_vec_info last = NULL;
2621 while (vinfo)
2622 {
2623 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2624 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2625 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2626 last = vinfo;
2627 vinfo = next;
2628 }
2629 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2630 /* It can be still vectorized as part of an SLP reduction. */
2631 loop_vinfo->reductions.safe_push (last);
2632 }
2633 }
2634
2635 /* Find SLP sequences starting from groups of reductions. */
2636 if (loop_vinfo->reductions.length () > 1)
2637 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2638 max_tree_size);
2639 }
2640
2641 /* The map keeps a reference on SLP nodes built, release that. */
2642 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2643 it != bst_map->end (); ++it)
2644 if ((*it).second)
2645 vect_free_slp_tree ((*it).second);
2646 delete bst_map;
2647
2648 return opt_result::success ();
2649 }
2650
2651 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2652
2653 static void
2654 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2655 vec<slp_tree> &vertices, vec<int> &leafs)
2656 {
2657 unsigned i;
2658 slp_tree child;
2659
2660 if (visited.add (node))
2661 return;
2662
2663 node->vertex = vertices.length ();
2664 vertices.safe_push (node);
2665
2666 bool leaf = true;
2667 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2668 if (child)
2669 {
2670 leaf = false;
2671 vect_slp_build_vertices (visited, child, vertices, leafs);
2672 }
2673 if (leaf)
2674 leafs.safe_push (node->vertex);
2675 }
2676
2677 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2678
2679 static void
2680 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2681 vec<int> &leafs)
2682 {
2683 hash_set<slp_tree> visited;
2684 unsigned i;
2685 slp_instance instance;
2686 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2687 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2688 leafs);
2689 }
2690
2691 /* Apply (reverse) bijective PERM to VEC. */
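/* For example, with PERM == { 1, 2, 0 } and VEC == { a, b, c } the forward
   application yields { b, c, a } and the reverse application yields
   { c, a, b }.  */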
2692
2693 template <class T>
2694 static void
2695 vect_slp_permute (vec<unsigned> perm,
2696 vec<T> &vec, bool reverse)
2697 {
2698 auto_vec<T, 64> saved;
2699 saved.create (vec.length ());
2700 for (unsigned i = 0; i < vec.length (); ++i)
2701 saved.quick_push (vec[i]);
2702
2703 if (reverse)
2704 {
2705 for (unsigned i = 0; i < vec.length (); ++i)
2706 vec[perm[i]] = saved[i];
2707 for (unsigned i = 0; i < vec.length (); ++i)
2708 gcc_assert (vec[perm[i]] == saved[i]);
2709 }
2710 else
2711 {
2712 for (unsigned i = 0; i < vec.length (); ++i)
2713 vec[i] = saved[perm[i]];
2714 for (unsigned i = 0; i < vec.length (); ++i)
2715 gcc_assert (vec[i] == saved[perm[i]]);
2716 }
2717 }
2718
2719 /* Return whether permutations PERM_A and PERM_B as recorded in the
2720 PERMS vector are equal. */
2721
2722 static bool
2723 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2724 int perm_a, int perm_b)
2725 {
2726 return (perm_a == perm_b
2727 || (perms[perm_a].length () == perms[perm_b].length ()
2728 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2729 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2730 }
2731
2732 /* Optimize the SLP graph of VINFO. */
2733
2734 void
2735 vect_optimize_slp (vec_info *vinfo)
2736 {
2737 if (vinfo->slp_instances.is_empty ())
2738 return;
2739
2740 slp_tree node;
2741 unsigned i;
2742 auto_vec<slp_tree> vertices;
2743 auto_vec<int> leafs;
2744 vect_slp_build_vertices (vinfo, vertices, leafs);
2745
2746 struct graph *slpg = new_graph (vertices.length ());
2747 FOR_EACH_VEC_ELT (vertices, i, node)
2748 {
2749 unsigned j;
2750 slp_tree child;
2751 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2752 if (child)
2753 add_edge (slpg, i, child->vertex);
2754 }
2755
2756 /* Compute (reverse) postorder on the inverted graph. */
2757 auto_vec<int> ipo;
2758 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2759
2760 auto_sbitmap n_visited (vertices.length ());
2761 auto_sbitmap n_materialize (vertices.length ());
2762 auto_vec<int> n_perm (vertices.length ());
2763 auto_vec<vec<unsigned> > perms;
2764
2765 bitmap_clear (n_visited);
2766 bitmap_clear (n_materialize);
2767 n_perm.quick_grow_cleared (vertices.length ());
2768 perms.safe_push (vNULL); /* zero is no permute */
2769
2770 /* Produce initial permutations. */
2771 for (i = 0; i < leafs.length (); ++i)
2772 {
2773 int idx = leafs[i];
2774 slp_tree node = vertices[idx];
2775
2776 /* Handle externals and constants optimistically throughout the
2777 iteration. */
2778 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2779 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2780 continue;
2781
2782 /* Loads are the only thing generating permutes and leafs do not
2783 change across iterations. */
2784 bitmap_set_bit (n_visited, idx);
2785 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2786 continue;
2787
2788 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2789 node unpermuted, record this permute. */
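/* For example, a load permutation { 5, 7, 6, 4 } covers the contiguous
   element range [4, 7] of its group and is bijective on it, so it is
   recorded below as the permute { 1, 3, 2, 0 } relative to element 4.  */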
2790 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2791 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2792 continue;
2793 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2794 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2795 bool any_permute = false;
2796 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2797 {
2798 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2799 imin = MIN (imin, idx);
2800 imax = MAX (imax, idx);
2801 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2802 any_permute = true;
2803 }
2804 /* If there's no permute no need to split one out. */
2805 if (!any_permute)
2806 continue;
2807 /* If the span doesn't match we'd disrupt VF computation, avoid
2808 that for now. */
2809 if (imax - imin + 1 != SLP_TREE_LANES (node))
2810 continue;
2811
2812 /* For now only handle true permutes, like
2813 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2814 when permuting constants and invariants keeping the permute
2815 bijective. */
2816 auto_sbitmap load_index (SLP_TREE_LANES (node));
2817 bitmap_clear (load_index);
2818 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2819 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2820 unsigned j;
2821 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2822 if (!bitmap_bit_p (load_index, j))
2823 break;
2824 if (j != SLP_TREE_LANES (node))
2825 continue;
2826
2827 vec<unsigned> perm = vNULL;
2828 perm.safe_grow (SLP_TREE_LANES (node), true);
2829 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2830 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2831 perms.safe_push (perm);
2832 n_perm[idx] = perms.length () - 1;
2833 }
2834
2835 /* Propagate permutes along the graph and compute materialization points. */
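/* For example, when two loads carrying the same recorded permute feed an
   addition whose result is stored, the addition inherits the loads' permute;
   because its user, the store, stays unpermuted, the addition is marked as a
   materialization point and the permute is later emitted there as an explicit
   VEC_PERM node.  */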
2836 bool changed;
2837 unsigned iteration = 0;
2838 do
2839 {
2840 changed = false;
2841 ++iteration;
2842
2843 for (i = vertices.length (); i > 0 ; --i)
2844 {
2845 int idx = ipo[i-1];
2846 slp_tree node = vertices[idx];
2847 /* For leafs there's nothing to do - we've seeded permutes
2848 on those above. */
2849 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2850 continue;
2851
2852 bitmap_set_bit (n_visited, idx);
2853
2854 /* We cannot move a permute across a store. */
2855 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2856 && DR_IS_WRITE
2857 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2858 continue;
2859
2860 int perm = -1;
2861 for (graph_edge *succ = slpg->vertices[idx].succ;
2862 succ; succ = succ->succ_next)
2863 {
2864 int succ_idx = succ->dest;
2865 /* Handle unvisited nodes optimistically. */
2866 /* ??? But for constants once we want to handle non-bijective
2867 permutes we have to verify the permute, when unifying lanes,
2868 will not unify different constants. For example see
2869 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2870 if (!bitmap_bit_p (n_visited, succ_idx))
2871 continue;
2872 int succ_perm = n_perm[succ_idx];
2873 /* Once we materialize the successor's permutation its output lanes
2874 appear unpermuted to us. */
2875 if (bitmap_bit_p (n_materialize, succ_idx))
2876 succ_perm = 0;
2877 if (perm == -1)
2878 perm = succ_perm;
2879 else if (succ_perm == 0)
2880 {
2881 perm = 0;
2882 break;
2883 }
2884 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2885 {
2886 perm = 0;
2887 break;
2888 }
2889 }
2890
2891 if (perm == -1)
2892 /* Pick up pre-computed leaf values. */
2893 perm = n_perm[idx];
2894 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2895 {
2896 if (iteration > 1)
2897 /* Make sure we eventually converge. */
2898 gcc_checking_assert (perm == 0);
2899 n_perm[idx] = perm;
2900 if (perm == 0)
2901 bitmap_clear_bit (n_materialize, idx);
2902 changed = true;
2903 }
2904
2905 if (perm == 0)
2906 continue;
2907
2908 /* Elide pruning at materialization points in the first
2909 iteration so that every node is visited at least once. */
2910 if (iteration == 1)
2911 continue;
2912
2913 /* Decide on permute materialization. Look whether there's
2914 a use (pred) edge that is permuted differently than us.
2915 In that case mark ourselves so the permutation is applied. */
2916 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2917 for (graph_edge *pred = slpg->vertices[idx].pred;
2918 pred; pred = pred->pred_next)
2919 {
2920 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2921 int pred_perm = n_perm[pred->src];
2922 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2923 {
2924 all_preds_permuted = false;
2925 break;
2926 }
2927 }
2928 if (!all_preds_permuted)
2929 {
2930 if (!bitmap_bit_p (n_materialize, idx))
2931 changed = true;
2932 bitmap_set_bit (n_materialize, idx);
2933 }
2934 }
2935 }
2936 while (changed || iteration == 1);
2937
2938 /* Materialize. */
2939 for (i = 0; i < vertices.length (); ++i)
2940 {
2941 int perm = n_perm[i];
2942 if (perm <= 0)
2943 continue;
2944
2945 slp_tree node = vertices[i];
2946
2947 /* First permute invariant/external original successors. */
2948 unsigned j;
2949 slp_tree child;
2950 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2951 {
2952 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2953 continue;
2954
2955 /* If the vector is uniform there's nothing to do. */
2956 if (vect_slp_tree_uniform_p (child))
2957 continue;
2958
2959 /* We can end up sharing some externals via two_operator
2960 handling. Be prepared to unshare those. */
2961 if (child->refcnt != 1)
2962 {
2963 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2964 SLP_TREE_CHILDREN (node)[j] = child
2965 = vect_create_new_slp_node
2966 (SLP_TREE_SCALAR_OPS (child).copy ());
2967 }
2968 vect_slp_permute (perms[perm],
2969 SLP_TREE_SCALAR_OPS (child), true);
2970 }
2971
2972 if (bitmap_bit_p (n_materialize, i))
2973 {
2974 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2975 /* For loads simply drop the permutation, the load permutation
2976 already performs the desired permutation. */
2977 ;
2978 else
2979 {
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "inserting permute node in place of %p\n",
2983 node);
2984
2985 /* Make a copy of NODE and in-place change it to a
2986 VEC_PERM node to permute the lanes of the copy. */
2987 slp_tree copy = new _slp_tree;
2988 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
2989 SLP_TREE_CHILDREN (node) = vNULL;
2990 SLP_TREE_SCALAR_STMTS (copy)
2991 = SLP_TREE_SCALAR_STMTS (node).copy ();
2992 vect_slp_permute (perms[perm],
2993 SLP_TREE_SCALAR_STMTS (copy), true);
2994 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
2995 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
2996 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
2997 SLP_TREE_LANE_PERMUTATION (copy)
2998 = SLP_TREE_LANE_PERMUTATION (node);
2999 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3000 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3001 copy->refcnt = 1;
3002 copy->max_nunits = node->max_nunits;
3003 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3004 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3005 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3006
3007 /* Now turn NODE into a VEC_PERM. */
3008 SLP_TREE_CHILDREN (node).safe_push (copy);
3009 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3010 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3011 SLP_TREE_LANE_PERMUTATION (node)
3012 .quick_push (std::make_pair (0, perms[perm][j]));
3013 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3014 }
3015 }
3016 else
3017 {
3018 /* Apply the reverse permutation to our stmts. */
3019 vect_slp_permute (perms[perm],
3020 SLP_TREE_SCALAR_STMTS (node), true);
3021 /* And to the load permutation, which we can simply
3022 make regular by design. */
3023 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3024 {
3025 /* ??? When we handle non-bijective permutes the idea
3026 is that we can force the load-permutation to be
3027 { min, min + 1, min + 2, ... max }. But then the
3028 scalar defs might no longer match the lane content
3029 which means wrong-code with live lane vectorization.
3030 So we possibly have to have NULL entries for those. */
3031 vect_slp_permute (perms[perm],
3032 SLP_TREE_LOAD_PERMUTATION (node), true);
3033 }
3034 }
3035 }
3036
3037 /* Free the perms vector used for propagation. */
3038 while (!perms.is_empty ())
3039 perms.pop ().release ();
3040 free_graph (slpg);
3041
3042
3043 /* Now elide load permutations that are not necessary. */
3044 for (i = 0; i < leafs.length (); ++i)
3045 {
3046 node = vertices[leafs[i]];
3047 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3048 continue;
3049
3050 /* In basic block vectorization we allow any subchain of an interleaving
3051 chain.
3052 FORNOW: not in loop SLP because of realignment complications. */
3053 if (is_a <bb_vec_info> (vinfo))
3054 {
3055 bool subchain_p = true;
3056 stmt_vec_info next_load_info = NULL;
3057 stmt_vec_info load_info;
3058 unsigned j;
3059 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3060 {
3061 if (j != 0
3062 && (next_load_info != load_info
3063 || DR_GROUP_GAP (load_info) != 1))
3064 {
3065 subchain_p = false;
3066 break;
3067 }
3068 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3069 }
3070 if (subchain_p)
3071 {
3072 SLP_TREE_LOAD_PERMUTATION (node).release ();
3073 continue;
3074 }
3075 }
3076 else
3077 {
3078 stmt_vec_info load_info;
3079 bool this_load_permuted = false;
3080 unsigned j;
3081 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3082 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3083 {
3084 this_load_permuted = true;
3085 break;
3086 }
3087 stmt_vec_info first_stmt_info
3088 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3089 if (!this_load_permuted
3090 /* The load requires permutation when unrolling exposes
3091 a gap either because the group is larger than the SLP
3092 group-size or because there is a gap between the groups. */
3093 && (known_eq (LOOP_VINFO_VECT_FACTOR
3094 (as_a <loop_vec_info> (vinfo)), 1U)
3095 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3096 && DR_GROUP_GAP (first_stmt_info) == 0)))
3097 {
3098 SLP_TREE_LOAD_PERMUTATION (node).release ();
3099 continue;
3100 }
3101 }
3102 }
3103 }
3104
3105
3106 /* For each possible SLP instance decide whether to SLP it and calculate overall
3107 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3108 least one instance. */
3109
3110 bool
3111 vect_make_slp_decision (loop_vec_info loop_vinfo)
3112 {
3113 unsigned int i;
3114 poly_uint64 unrolling_factor = 1;
3115 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3116 slp_instance instance;
3117 int decided_to_slp = 0;
3118
3119 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3120
3121 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3122 {
3123 /* FORNOW: SLP if you can. */
3124 /* All unroll factors have the form:
3125
3126 GET_MODE_SIZE (vinfo->vector_mode) * X
3127
3128 for some rational X, so they must have a common multiple. */
3129 unrolling_factor
3130 = force_common_multiple (unrolling_factor,
3131 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3132
3133 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3134 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3135 loop-based vectorization. Such stmts will be marked as HYBRID. */
3136 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3137 decided_to_slp++;
3138 }
3139
3140 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3141
3142 if (decided_to_slp && dump_enabled_p ())
3143 {
3144 dump_printf_loc (MSG_NOTE, vect_location,
3145 "Decided to SLP %d instances. Unrolling factor ",
3146 decided_to_slp);
3147 dump_dec (MSG_NOTE, unrolling_factor);
3148 dump_printf (MSG_NOTE, "\n");
3149 }
3150
3151 return (decided_to_slp > 0);
3152 }
3153
3154 /* Private data for vect_detect_hybrid_slp. */
3155 struct vdhs_data
3156 {
3157 loop_vec_info loop_vinfo;
3158 vec<stmt_vec_info> *worklist;
3159 };
3160
3161 /* Walker for walk_gimple_op. */
3162
3163 static tree
3164 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3165 {
3166 walk_stmt_info *wi = (walk_stmt_info *)data;
3167 vdhs_data *dat = (vdhs_data *)wi->info;
3168
3169 if (wi->is_lhs)
3170 return NULL_TREE;
3171
3172 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3173 if (!def_stmt_info)
3174 return NULL_TREE;
3175 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3176 if (PURE_SLP_STMT (def_stmt_info))
3177 {
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3180 def_stmt_info->stmt);
3181 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3182 dat->worklist->safe_push (def_stmt_info);
3183 }
3184
3185 return NULL_TREE;
3186 }
3187
3188 /* Find stmts that must be both vectorized and SLPed. */
3189
3190 void
3191 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3192 {
3193 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3194
3195 /* All stmts participating in SLP are marked pure_slp, all other
3196 stmts are loop_vect.
3197 First collect all loop_vect stmts into a worklist. */
3198 auto_vec<stmt_vec_info> worklist;
3199 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3200 {
3201 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3202 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3203 gsi_next (&gsi))
3204 {
3205 gphi *phi = gsi.phi ();
3206 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3207 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3208 worklist.safe_push (stmt_info);
3209 }
3210 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3211 gsi_next (&gsi))
3212 {
3213 gimple *stmt = gsi_stmt (gsi);
3214 if (is_gimple_debug (stmt))
3215 continue;
3216 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3217 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3218 {
3219 for (gimple_stmt_iterator gsi2
3220 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3221 !gsi_end_p (gsi2); gsi_next (&gsi2))
3222 {
3223 stmt_vec_info patt_info
3224 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3225 if (!STMT_SLP_TYPE (patt_info)
3226 && STMT_VINFO_RELEVANT (patt_info))
3227 worklist.safe_push (patt_info);
3228 }
3229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3230 }
3231 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3232 worklist.safe_push (stmt_info);
3233 }
3234 }
3235
3236 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3237 mark any SLP vectorized stmt as hybrid.
3238 ??? We're visiting def stmts N times (once for each non-SLP and
3239 once for each hybrid-SLP use). */
3240 walk_stmt_info wi;
3241 vdhs_data dat;
3242 dat.worklist = &worklist;
3243 dat.loop_vinfo = loop_vinfo;
3244 memset (&wi, 0, sizeof (wi));
3245 wi.info = (void *)&dat;
3246 while (!worklist.is_empty ())
3247 {
3248 stmt_vec_info stmt_info = worklist.pop ();
3249 /* Since SSA operands are not set up for pattern stmts we need
3250 to use walk_gimple_op. */
3251 wi.is_lhs = 0;
3252 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3253 }
3254 }
3255
3256
3257 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3258
3259 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3260 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3261 {
3262 for (unsigned i = 0; i < bbs.length (); ++i)
3263 {
3264 if (i != 0)
3265 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3266 gsi_next (&si))
3267 {
3268 gphi *phi = si.phi ();
3269 gimple_set_uid (phi, 0);
3270 add_stmt (phi);
3271 }
3272 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3273 !gsi_end_p (gsi); gsi_next (&gsi))
3274 {
3275 gimple *stmt = gsi_stmt (gsi);
3276 gimple_set_uid (stmt, 0);
3277 if (is_gimple_debug (stmt))
3278 continue;
3279 add_stmt (stmt);
3280 }
3281 }
3282 }
3283
3284
3285 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3286 stmts in the basic block. */
3287
3288 _bb_vec_info::~_bb_vec_info ()
3289 {
3290 /* Reset region marker. */
3291 for (unsigned i = 0; i < bbs.length (); ++i)
3292 {
3293 if (i != 0)
3294 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3295 gsi_next (&si))
3296 {
3297 gphi *phi = si.phi ();
3298 gimple_set_uid (phi, -1);
3299 }
3300 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3301 !gsi_end_p (gsi); gsi_next (&gsi))
3302 {
3303 gimple *stmt = gsi_stmt (gsi);
3304 gimple_set_uid (stmt, -1);
3305 }
3306 }
3307 }
3308
3309 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3310 given that child nodes have already been processed, and that
3311 their def types currently match their SLP node's def type. */
3312
3313 static bool
3314 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3315 slp_instance node_instance,
3316 stmt_vector_for_cost *cost_vec)
3317 {
3318 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3319 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3320
3321 /* Calculate the number of vector statements to be created for the
3322 scalar stmts in this node. For SLP reductions it is equal to the
3323 number of vector statements in the children (which has already been
3324 calculated by the recursive call). Otherwise it is the number of
3325 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3326 VF divided by the number of elements in a vector. */
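/* For example, with SLP_TREE_LANES == 4, a vectorization factor of 4 and a
   vector type holding 8 elements this computes 4 * 4 / 8 == 2 vector
   stmts.  */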
3327 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3328 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3329 {
3330 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3331 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3332 {
3333 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3334 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3335 break;
3336 }
3337 }
3338 else
3339 {
3340 poly_uint64 vf;
3341 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3342 vf = loop_vinfo->vectorization_factor;
3343 else
3344 vf = 1;
3345 unsigned int group_size = SLP_TREE_LANES (node);
3346 tree vectype = SLP_TREE_VECTYPE (node);
3347 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3348 = vect_get_num_vectors (vf * group_size, vectype);
3349 }
3350
3351 /* Handle purely internal nodes. */
3352 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3353 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3354
3355 if (is_a <bb_vec_info> (vinfo)
3356 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3357 return false;
3358
3359 bool dummy;
3360 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3361 node, node_instance, cost_vec);
3362 }
3363
3364 /* Try to build NODE from scalars, returning true on success.
3365 NODE_INSTANCE is the SLP instance that contains NODE. */
3366
3367 static bool
3368 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3369 slp_instance node_instance)
3370 {
3371 stmt_vec_info stmt_info;
3372 unsigned int i;
3373
3374 if (!is_a <bb_vec_info> (vinfo)
3375 || node == SLP_INSTANCE_TREE (node_instance)
3376 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3377 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3378 return false;
3379
3380 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_NOTE, vect_location,
3382 "Building vector operands from scalars instead\n");
3383
3384 /* Don't remove and free the child nodes here, since they could be
3385 referenced by other structures. The analysis and scheduling phases
3386 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3387 unsigned int group_size = SLP_TREE_LANES (node);
3388 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3389 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3390 SLP_TREE_LOAD_PERMUTATION (node).release ();
3391 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3392 {
3393 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3394 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3395 }
3396 return true;
3397 }
3398
3399 /* Compute the prologue cost for invariant or constant operands represented
3400 by NODE. */
3401
3402 static void
3403 vect_prologue_cost_for_slp (slp_tree node,
3404 stmt_vector_for_cost *cost_vec)
3405 {
3406 /* There's a special case of an existing vector, that costs nothing. */
3407 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3408 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3409 return;
3410 /* Without looking at the actual initializer a vector of
3411 constants can be implemented as load from the constant pool.
3412 When all elements are the same we can use a splat. */
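/* For example, an external node with operands { x, x, x, x } is costed as a
   scalar_to_vec splat, one with mixed operands { x, y, x, y } as a
   vec_construct, and a constant node as a vector_load from the constant
   pool.  */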
3413 tree vectype = SLP_TREE_VECTYPE (node);
3414 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3415 unsigned num_vects_to_check;
3416 unsigned HOST_WIDE_INT const_nunits;
3417 unsigned nelt_limit;
3418 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3419 && ! multiple_p (const_nunits, group_size))
3420 {
3421 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3422 nelt_limit = const_nunits;
3423 }
3424 else
3425 {
3426 /* If either the vector has variable length or the vectors
3427 are composed of repeated whole groups we only need to
3428 cost construction once. All vectors will be the same. */
3429 num_vects_to_check = 1;
3430 nelt_limit = group_size;
3431 }
3432 tree elt = NULL_TREE;
3433 unsigned nelt = 0;
3434 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3435 {
3436 unsigned si = j % group_size;
3437 if (nelt == 0)
3438 elt = SLP_TREE_SCALAR_OPS (node)[si];
3439 /* ??? We're just tracking whether all operands of a single
3440 vector initializer are the same, ideally we'd check if
3441 we emitted the same one already. */
3442 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3443 elt = NULL_TREE;
3444 nelt++;
3445 if (nelt == nelt_limit)
3446 {
3447 record_stmt_cost (cost_vec, 1,
3448 SLP_TREE_DEF_TYPE (node) == vect_external_def
3449 ? (elt ? scalar_to_vec : vec_construct)
3450 : vector_load,
3451 NULL, vectype, 0, vect_prologue);
3452 nelt = 0;
3453 }
3454 }
3455 }
3456
3457 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3458 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3459
3460 Return true if the operations are supported. */
3461
3462 static bool
3463 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3464 slp_instance node_instance,
3465 hash_set<slp_tree> &visited,
3466 hash_set<slp_tree> &lvisited,
3467 stmt_vector_for_cost *cost_vec)
3468 {
3469 int i, j;
3470 slp_tree child;
3471
3472 /* Assume we can code-generate all invariants. */
3473 if (!node
3474 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3475 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3476 return true;
3477
3478 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3479 {
3480 if (dump_enabled_p ())
3481 dump_printf_loc (MSG_NOTE, vect_location,
3482 "Failed cyclic SLP reference in %p", node);
3483 return false;
3484 }
3485 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3486
3487 /* If we already analyzed the exact same set of scalar stmts we're done.
3488 We share the generated vector stmts for those. */
3489 if (visited.contains (node)
3490 || lvisited.add (node))
3491 return true;
3492
3493 bool res = true;
3494 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3495 {
3496 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3497 visited, lvisited, cost_vec);
3498 if (!res)
3499 break;
3500 }
3501
3502 if (res)
3503 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3504 cost_vec);
3505 if (!res)
3506 lvisited.remove (node);
3507
3508 /* When the node can be vectorized, cost the invariant nodes it references.
3509 This is not done in DFS order to allow the referring node's
3510 vectorizable_* calls to nail down the invariant node's vector type
3511 and possibly unshare it if it needs a different vector type than
3512 other referrers. */
3513 if (res)
3514 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3515 if (child
3516 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3517 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3518 /* Perform usual caching, note code-generation still
3519 code-gens these nodes multiple times but we expect
3520 to CSE them later. */
3521 && !visited.contains (child)
3522 && !lvisited.add (child))
3523 {
3524 /* ??? After auditing more code paths make a "default"
3525 and push the vector type from NODE to all children
3526 if it is not already set. */
3527 /* Compute the number of vectors to be generated. */
3528 tree vector_type = SLP_TREE_VECTYPE (child);
3529 if (!vector_type)
3530 {
3531 /* For shifts with a scalar argument we don't need
3532 to cost or code-generate anything.
3533 ??? Represent this more explicitly. */
3534 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3535 == shift_vec_info_type)
3536 && j == 1);
3537 continue;
3538 }
3539 unsigned group_size = SLP_TREE_LANES (child);
3540 poly_uint64 vf = 1;
3541 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3542 vf = loop_vinfo->vectorization_factor;
3543 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3544 = vect_get_num_vectors (vf * group_size, vector_type);
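/* For example, with a loop vectorization factor of 4, a two-lane invariant
   child and V4SI vectors this yields (4 * 2) / 4 == 2 vector stmts
   (illustrative numbers).  */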
3545 /* And cost them. */
3546 vect_prologue_cost_for_slp (child, cost_vec);
3547 }
3548
3549 /* If this node or any of its children can't be vectorized, try pruning
3550 the tree here rather than felling the whole thing. */
3551 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3552 {
3553 /* We'll need to revisit this for invariant costing and number
3554 of vectorized stmt setting. */
3555 res = true;
3556 }
3557
3558 return res;
3559 }
3560
3561
3562 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3563 region and that can be vectorized using vectorizable_live_operation
3564 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
3565 the scalar code computing them to be retained. */
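/* For example, a def in NODE used by a stmt that is not part of any SLP
   instance must stay available after vectorization: we either mark its
   stmt STMT_VINFO_LIVE_P so a lane extract can be generated for it, or,
   failing that, keep the scalar computation.  */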
3566
3567 static void
3568 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3569 slp_instance instance,
3570 stmt_vector_for_cost *cost_vec,
3571 hash_set<stmt_vec_info> &svisited,
3572 hash_set<slp_tree> &visited)
3573 {
3574 if (visited.add (node))
3575 return;
3576
3577 unsigned i;
3578 stmt_vec_info stmt_info;
3579 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3580 bool all_visited = true;
3581 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3582 {
3583 if (svisited.contains (stmt_info))
3584 continue;
3585 all_visited = false;
3586 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3587 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3588 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3589 /* Only the pattern root stmt computes the original scalar value. */
3590 continue;
3591 bool mark_visited = true;
3592 gimple *orig_stmt = orig_stmt_info->stmt;
3593 ssa_op_iter op_iter;
3594 def_operand_p def_p;
3595 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3596 {
3597 imm_use_iterator use_iter;
3598 gimple *use_stmt;
3599 stmt_vec_info use_stmt_info;
3600 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3601 if (!is_gimple_debug (use_stmt))
3602 {
3603 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3604 if (!use_stmt_info
3605 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3606 {
3607 STMT_VINFO_LIVE_P (stmt_info) = true;
3608 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3609 NULL, node, instance, i,
3610 false, cost_vec))
3611 /* ??? So we know we can vectorize the live stmt
3612 from one SLP node. If we cannot do so from all
3613 or none consistently we'd have to record which
3614 SLP node (and lane) we want to use for the live
3615 operation. So make sure we can code-generate
3616 from all nodes. */
3617 mark_visited = false;
3618 else
3619 STMT_VINFO_LIVE_P (stmt_info) = false;
3620 BREAK_FROM_IMM_USE_STMT (use_iter);
3621 }
3622 }
3623 /* We have to verify whether we can insert the lane extract
3624 before all uses. The following is a conservative approximation.
3625 We cannot put this into vectorizable_live_operation because
3626 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3627 doesn't work.
3628 Note that while the fact that we emit code for loads at the
3629 first load should make this a non-problem, leaves we construct
3630 from scalars are vectorized after the last scalar def.
3631 ??? If we'd actually compute the insert location during
3632 analysis we could use sth less conservative than the last
3633 scalar stmt in the node for the dominance check. */
3634 /* ??? What remains is "live" uses in vector CTORs in the same
3635 SLP graph which is where those uses can end up code-generated
3636 right after their definition instead of close to their original
3637 use. But that would restrict us to code-generate lane-extracts
3638 from the latest stmt in a node. So we compensate for this
3639 during code-generation, simply not replacing uses for those
3640 hopefully rare cases. */
3641 if (STMT_VINFO_LIVE_P (stmt_info))
3642 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3643 if (!is_gimple_debug (use_stmt)
3644 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3645 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3646 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3647 {
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "Cannot determine insertion place for "
3651 "lane extract\n");
3652 STMT_VINFO_LIVE_P (stmt_info) = false;
3653 mark_visited = true;
3654 }
3655 }
3656 if (mark_visited)
3657 svisited.add (stmt_info);
3658 }
3659 if (all_visited)
3660 return;
3661
3662 slp_tree child;
3663 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3664 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3665 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3666 cost_vec, svisited, visited);
3667 }
3668
3669 /* Analyze statements in SLP instances of VINFO. Return true if the
3670 operations are supported. */
3671
3672 bool
3673 vect_slp_analyze_operations (vec_info *vinfo)
3674 {
3675 slp_instance instance;
3676 int i;
3677
3678 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3679
3680 hash_set<slp_tree> visited;
3681 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3682 {
3683 hash_set<slp_tree> lvisited;
3684 stmt_vector_for_cost cost_vec;
3685 cost_vec.create (2);
3686 if (is_a <bb_vec_info> (vinfo))
3687 vect_location = instance->location ();
3688 if (!vect_slp_analyze_node_operations (vinfo,
3689 SLP_INSTANCE_TREE (instance),
3690 instance, visited, lvisited,
3691 &cost_vec)
3692 /* Instances with a root stmt require vectorized defs for the
3693 SLP tree root. */
3694 || (SLP_INSTANCE_ROOT_STMT (instance)
3695 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3696 != vect_internal_def)))
3697 {
3698 slp_tree node = SLP_INSTANCE_TREE (instance);
3699 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3700 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_NOTE, vect_location,
3702 "removing SLP instance operations starting from: %G",
3703 stmt_info->stmt);
3704 vect_free_slp_instance (instance);
3705 vinfo->slp_instances.ordered_remove (i);
3706 cost_vec.release ();
3707 }
3708 else
3709 {
3710 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3711 x != lvisited.end(); ++x)
3712 visited.add (*x);
3713 i++;
3714
3715 /* For BB vectorization remember the SLP graph entry
3716 cost for later. */
3717 if (is_a <bb_vec_info> (vinfo))
3718 instance->cost_vec = cost_vec;
3719 else
3720 {
3721 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3722 cost_vec.release ();
3723 }
3724 }
3725 }
3726
3727 /* Compute vectorizable live stmts. */
3728 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3729 {
3730 hash_set<stmt_vec_info> svisited;
3731 hash_set<slp_tree> visited;
3732 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3733 {
3734 vect_location = instance->location ();
3735 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3736 instance, &instance->cost_vec, svisited,
3737 visited);
3738 }
3739 }
3740
3741 return !vinfo->slp_instances.is_empty ();
3742 }
3743
3744 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3745 closing any chain encountered along the way. */
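/* This works like a union-find lookup with path compression: e.g. if
   INSTANCE_LEADER maps A -> B, B -> C and C -> C, querying A returns C
   and rewrites both A and B to point directly at C.  */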
3746
3747 static slp_instance
3748 get_ultimate_leader (slp_instance instance,
3749 hash_map<slp_instance, slp_instance> &instance_leader)
3750 {
3751 auto_vec<slp_instance *, 8> chain;
3752 slp_instance *tem;
3753 while (*(tem = instance_leader.get (instance)) != instance)
3754 {
3755 chain.safe_push (tem);
3756 instance = *tem;
3757 }
3758 while (!chain.is_empty ())
3759 *chain.pop () = instance;
3760 return instance;
3761 }
3762
3763 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3764
3765 static void
3766 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3767 slp_instance instance, slp_tree node,
3768 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3769 hash_map<slp_instance, slp_instance> &instance_leader,
3770 hash_set<slp_tree> &visited)
3771 {
3772 stmt_vec_info stmt_info;
3773 unsigned i;
3774
3775 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3776 {
3777 bool existed_p;
3778 slp_instance &stmt_instance
3779 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3780 if (!existed_p)
3781 ;
3782 else if (stmt_instance != instance)
3783 {
3784 /* If we're running into a previously marked stmt make us the
3785 leader of the current ultimate leader. This keeps the
3786 leader chain acyclic and works even when the current instance
3787 connects two previously independent graph parts. */
3788 slp_instance stmt_leader
3789 = get_ultimate_leader (stmt_instance, instance_leader);
3790 if (stmt_leader != instance)
3791 instance_leader.put (stmt_leader, instance);
3792 }
3793 stmt_instance = instance;
3794 }
3795
3796 if (visited.add (node))
3797 return;
3798
3799 slp_tree child;
3800 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3801 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3802 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3803 instance_leader, visited);
3804 }
3805
3806 /* Partition the SLP graph into pieces that can be costed independently. */
3807
3808 static void
3809 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3810 {
3811 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3812
3813 /* First walk the SLP graph assigning each involved scalar stmt a
3814 corresponding SLP graph entry and upon visiting a previously
3815 marked stmt, make the stmt's leader the current SLP graph entry. */
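/* For example, if instances I1 and I2 share a scalar stmt, processing I2
   redirects I1's ultimate leader to point at I2, and the collection loop
   below then puts both I1 and I2 into the same subgraph_entries vector.  */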
3816 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3817 hash_map<slp_instance, slp_instance> instance_leader;
3818 hash_set<slp_tree> visited;
3819 slp_instance instance;
3820 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3821 {
3822 instance_leader.put (instance, instance);
3823 vect_bb_partition_graph_r (bb_vinfo,
3824 instance, SLP_INSTANCE_TREE (instance),
3825 stmt_to_instance, instance_leader,
3826 visited);
3827 }
3828
3829 /* Then collect entries to each independent subgraph. */
3830 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3831 {
3832 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3833 leader->subgraph_entries.safe_push (instance);
3834 if (dump_enabled_p ()
3835 && leader != instance)
3836 dump_printf_loc (MSG_NOTE, vect_location,
3837 "instance %p is leader of %p\n",
3838 leader, instance);
3839 }
3840 }
3841
3842 /* Compute the scalar cost of the SLP node NODE and its children
3843 and record it in COST_VEC. Do not account defs that are marked in LIFE and
3844 update LIFE according to uses of NODE. */
3845
3846 static void
3847 vect_bb_slp_scalar_cost (vec_info *vinfo,
3848 slp_tree node, vec<bool, va_heap> *life,
3849 stmt_vector_for_cost *cost_vec,
3850 hash_set<slp_tree> &visited)
3851 {
3852 unsigned i;
3853 stmt_vec_info stmt_info;
3854 slp_tree child;
3855
3856 if (visited.add (node))
3857 return;
3858
3859 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3860 {
3861 ssa_op_iter op_iter;
3862 def_operand_p def_p;
3863
3864 if ((*life)[i])
3865 continue;
3866
3867 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3868 gimple *orig_stmt = orig_stmt_info->stmt;
3869
3870 /* If there is a non-vectorized use of the defs then the scalar
3871 stmt is kept live in which case we do not account it or any
3872 required defs in the SLP children in the scalar cost. This
3873 way we make the vectorization more costly when compared to
3874 the scalar cost. */
3875 if (!STMT_VINFO_LIVE_P (stmt_info))
3876 {
3877 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3878 {
3879 imm_use_iterator use_iter;
3880 gimple *use_stmt;
3881 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3882 if (!is_gimple_debug (use_stmt))
3883 {
3884 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3885 if (!use_stmt_info
3886 || !PURE_SLP_STMT
3887 (vect_stmt_to_vectorize (use_stmt_info)))
3888 {
3889 (*life)[i] = true;
3890 BREAK_FROM_IMM_USE_STMT (use_iter);
3891 }
3892 }
3893 }
3894 if ((*life)[i])
3895 continue;
3896 }
3897
3898 /* Count scalar stmts only once. */
3899 if (gimple_visited_p (orig_stmt))
3900 continue;
3901 gimple_set_visited (orig_stmt, true);
3902
3903 vect_cost_for_stmt kind;
3904 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3905 {
3906 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3907 kind = scalar_load;
3908 else
3909 kind = scalar_store;
3910 }
3911 else if (vect_nop_conversion_p (orig_stmt_info))
3912 continue;
3913 else
3914 kind = scalar_stmt;
3915 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3916 }
3917
3918 auto_vec<bool, 20> subtree_life;
3919 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3920 {
3921 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3922 {
3923 /* Do not directly pass LIFE to the recursive call, copy it to
3924 confine changes in the callee to the current child/subtree. */
3925 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3926 {
3927 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3928 for (unsigned j = 0;
3929 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3930 {
3931 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3932 if (perm.first == i)
3933 subtree_life[perm.second] = (*life)[j];
3934 }
3935 }
3936 else
3937 {
3938 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3939 subtree_life.safe_splice (*life);
3940 }
3941 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3942 visited);
3943 subtree_life.truncate (0);
3944 }
3945 }
3946 }
3947
3948 /* Check if vectorization of the basic block is profitable for the
3949 subgraph denoted by SLP_INSTANCES. */
3950
3951 static bool
3952 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3953 vec<slp_instance> slp_instances)
3954 {
3955 slp_instance instance;
3956 int i;
3957 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3958 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3959
3960 void *vect_target_cost_data = init_cost (NULL);
3961
3962 /* Calculate scalar cost and sum the cost for the vector stmts
3963 previously collected. */
3964 stmt_vector_for_cost scalar_costs;
3965 scalar_costs.create (0);
3966 hash_set<slp_tree> visited;
3967 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3968 {
3969 auto_vec<bool, 20> life;
3970 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3971 true);
3972 vect_bb_slp_scalar_cost (bb_vinfo,
3973 SLP_INSTANCE_TREE (instance),
3974 &life, &scalar_costs, visited);
3975 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3976 instance->cost_vec.release ();
3977 }
3978 /* Unset visited flag. */
3979 stmt_info_for_cost *si;
3980 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3981 gimple_set_visited (si->stmt_info->stmt, false);
3982
3983 void *scalar_target_cost_data = init_cost (NULL);
3984 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3985 scalar_costs.release ();
3986 unsigned dummy;
3987 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3988 destroy_cost_data (scalar_target_cost_data);
3989
3990 /* Complete the target-specific vector cost calculation. */
3991 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3992 &vec_inside_cost, &vec_epilogue_cost);
3993 destroy_cost_data (vect_target_cost_data);
3994
3995 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3996
3997 if (dump_enabled_p ())
3998 {
3999 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4000 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4001 vec_inside_cost);
4002 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4003 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4004 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4005 }
4006
4007 /* Vectorization is profitable if its cost is no more than the cost of the
4008 scalar version. Note that we err on the vector side for equal cost because
4009 the cost estimate is otherwise quite pessimistic (constant uses are
4010 free on the scalar side but cost a load on the vector side for
4011 example). */
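  /* E.g. with a vector prologue cost of 4, an inside cost of 8, an epilogue
     cost of 0 and a scalar cost of 12 we still vectorize, since 4 + 8 + 0
     does not exceed 12 (illustrative numbers).  */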
4012 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4013 return false;
4014
4015 return true;
4016 }
4017
4018 /* Find any vectorizable constructors and add them to the grouped_store
4019 array. */
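/* For example, an assignment v_1 = {_2, _3, _4, _5} with a four-element
   vector left-hand side is recorded below; SLP discovery can then build an
   instance from the CONSTRUCTOR elements with the assignment as the
   instance root stmt.  */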
4020
4021 static void
4022 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4023 {
4024 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4025 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4026 !gsi_end_p (gsi); gsi_next (&gsi))
4027 {
4028 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4029 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4030 continue;
4031
4032 tree rhs = gimple_assign_rhs1 (assign);
4033 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4034 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4035 CONSTRUCTOR_NELTS (rhs))
4036 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4037 || uniform_vector_p (rhs))
4038 continue;
4039
4040 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4041 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4042 }
4043 }
4044
4045 /* Check if the region described by BB_VINFO can be vectorized, returning
4046 true if so. When returning false, set FATAL to true if the same failure
4047 would prevent vectorization at other vector sizes, false if it is still
4048 worth trying other sizes. N_STMTS is the number of statements in the
4049 region. */
4050
4051 static bool
4052 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4053 vec<int> *dataref_groups)
4054 {
4055 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4056
4057 slp_instance instance;
4058 int i;
4059 poly_uint64 min_vf = 2;
4060
4061 /* The first group of checks is independent of the vector size. */
4062 fatal = true;
4063
4064 /* Analyze the data references. */
4065
4066 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4067 {
4068 if (dump_enabled_p ())
4069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4070 "not vectorized: unhandled data-ref in basic "
4071 "block.\n");
4072 return false;
4073 }
4074
4075 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4076 {
4077 if (dump_enabled_p ())
4078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4079 "not vectorized: unhandled data access in "
4080 "basic block.\n");
4081 return false;
4082 }
4083
4084 vect_slp_check_for_constructors (bb_vinfo);
4085
4086 /* If there are no grouped stores and no constructors in the region
4087 there is no need to continue with pattern recog as vect_analyze_slp
4088 will fail anyway. */
4089 if (bb_vinfo->grouped_stores.is_empty ())
4090 {
4091 if (dump_enabled_p ())
4092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4093 "not vectorized: no grouped stores in "
4094 "basic block.\n");
4095 return false;
4096 }
4097
4098 /* The rest of the analysis below depends on the vector size in some way. */
4099 fatal = false;
4100
4101 vect_pattern_recog (bb_vinfo);
4102
4103 /* Check the SLP opportunities in the basic block, analyze and build SLP
4104 trees. */
4105 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4106 {
4107 if (dump_enabled_p ())
4108 {
4109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4110 "Failed to SLP the basic block.\n");
4111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4112 "not vectorized: failed to find SLP opportunities "
4113 "in basic block.\n");
4114 }
4115 return false;
4116 }
4117
4118 /* Optimize permutations. */
4119 vect_optimize_slp (bb_vinfo);
4120
4121 vect_record_base_alignments (bb_vinfo);
4122
4123 /* Analyze and verify the alignment of data references and the
4124 dependence in the SLP instances. */
4125 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4126 {
4127 vect_location = instance->location ();
4128 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4129 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4130 {
4131 slp_tree node = SLP_INSTANCE_TREE (instance);
4132 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4133 if (dump_enabled_p ())
4134 dump_printf_loc (MSG_NOTE, vect_location,
4135 "removing SLP instance operations starting from: %G",
4136 stmt_info->stmt);
4137 vect_free_slp_instance (instance);
4138 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4139 continue;
4140 }
4141
4142 /* Mark all the statements that we want to vectorize as pure SLP and
4143 relevant. */
4144 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4145 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4146 if (SLP_INSTANCE_ROOT_STMT (instance))
4147 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4148
4149 i++;
4150 }
4151 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4152 return false;
4153
4154 if (!vect_slp_analyze_operations (bb_vinfo))
4155 {
4156 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4158 "not vectorized: bad operation in basic block.\n");
4159 return false;
4160 }
4161
4162 vect_bb_partition_graph (bb_vinfo);
4163
4164 return true;
4165 }
4166
4167 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4168 basic blocks in BBS, returning true on success.
4169 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4170
4171 static bool
4172 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4173 vec<int> *dataref_groups, unsigned int n_stmts)
4174 {
4175 bb_vec_info bb_vinfo;
4176 auto_vector_modes vector_modes;
4177
4178 /* Autodetect first vector size we try. */
4179 machine_mode next_vector_mode = VOIDmode;
4180 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4181 unsigned int mode_i = 0;
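  /* The target provides its preferred modes here, for instance a
     hypothetical list like { V16QImode, V32QImode, V64QImode }; the loop
     below re-runs the analysis with each mode until something is
     vectorized, a fatal failure is seen or the list is exhausted.  */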
4182
4183 vec_info_shared shared;
4184
4185 machine_mode autodetected_vector_mode = VOIDmode;
4186 while (1)
4187 {
4188 bool vectorized = false;
4189 bool fatal = false;
4190 bb_vinfo = new _bb_vec_info (bbs, &shared);
4191
4192 bool first_time_p = shared.datarefs.is_empty ();
4193 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4194 if (first_time_p)
4195 bb_vinfo->shared->save_datarefs ();
4196 else
4197 bb_vinfo->shared->check_datarefs ();
4198 bb_vinfo->vector_mode = next_vector_mode;
4199
4200 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4201 && dbg_cnt (vect_slp))
4202 {
4203 if (dump_enabled_p ())
4204 {
4205 dump_printf_loc (MSG_NOTE, vect_location,
4206 "***** Analysis succeeded with vector mode"
4207 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4208 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4209 }
4210
4211 bb_vinfo->shared->check_datarefs ();
4212
4213 unsigned i;
4214 slp_instance instance;
4215 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4216 {
4217 if (instance->subgraph_entries.is_empty ())
4218 continue;
4219
4220 vect_location = instance->location ();
4221 if (!unlimited_cost_model (NULL)
4222 && !vect_bb_vectorization_profitable_p
4223 (bb_vinfo, instance->subgraph_entries))
4224 {
4225 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4227 "not vectorized: vectorization is not "
4228 "profitable.\n");
4229 continue;
4230 }
4231
4232 if (!vectorized && dump_enabled_p ())
4233 dump_printf_loc (MSG_NOTE, vect_location,
4234 "Basic block will be vectorized "
4235 "using SLP\n");
4236 vectorized = true;
4237
4238 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4239
4240 unsigned HOST_WIDE_INT bytes;
4241 if (dump_enabled_p ())
4242 {
4243 if (GET_MODE_SIZE
4244 (bb_vinfo->vector_mode).is_constant (&bytes))
4245 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4246 "basic block part vectorized using %wu "
4247 "byte vectors\n", bytes);
4248 else
4249 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4250 "basic block part vectorized using "
4251 "variable length vectors\n");
4252 }
4253 }
4254 }
4255 else
4256 {
4257 if (dump_enabled_p ())
4258 dump_printf_loc (MSG_NOTE, vect_location,
4259 "***** Analysis failed with vector mode %s\n",
4260 GET_MODE_NAME (bb_vinfo->vector_mode));
4261 }
4262
4263 if (mode_i == 0)
4264 autodetected_vector_mode = bb_vinfo->vector_mode;
4265
4266 if (!fatal)
4267 while (mode_i < vector_modes.length ()
4268 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4269 {
4270 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_NOTE, vect_location,
4272 "***** The result for vector mode %s would"
4273 " be the same\n",
4274 GET_MODE_NAME (vector_modes[mode_i]));
4275 mode_i += 1;
4276 }
4277
4278 delete bb_vinfo;
4279
4280 if (mode_i < vector_modes.length ()
4281 && VECTOR_MODE_P (autodetected_vector_mode)
4282 && (related_vector_mode (vector_modes[mode_i],
4283 GET_MODE_INNER (autodetected_vector_mode))
4284 == autodetected_vector_mode)
4285 && (related_vector_mode (autodetected_vector_mode,
4286 GET_MODE_INNER (vector_modes[mode_i]))
4287 == vector_modes[mode_i]))
4288 {
4289 if (dump_enabled_p ())
4290 dump_printf_loc (MSG_NOTE, vect_location,
4291 "***** Skipping vector mode %s, which would"
4292 " repeat the analysis for %s\n",
4293 GET_MODE_NAME (vector_modes[mode_i]),
4294 GET_MODE_NAME (autodetected_vector_mode));
4295 mode_i += 1;
4296 }
4297
4298 if (vectorized
4299 || mode_i == vector_modes.length ()
4300 || autodetected_vector_mode == VOIDmode
4301 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4302 vector sizes will fail do not bother iterating. */
4303 || fatal)
4304 return vectorized;
4305
4306 /* Try the next vector mode in the list. */
4307 next_vector_mode = vector_modes[mode_i++];
4308 if (dump_enabled_p ())
4309 dump_printf_loc (MSG_NOTE, vect_location,
4310 "***** Re-trying analysis with vector mode %s\n",
4311 GET_MODE_NAME (next_vector_mode));
4312 }
4313 }
4314
4315
4316 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4317 true if anything in the basic blocks was vectorized. */
4318
4319 static bool
4320 vect_slp_bbs (vec<basic_block> bbs)
4321 {
4322 vec<data_reference_p> datarefs = vNULL;
4323 auto_vec<int> dataref_groups;
4324 int insns = 0;
4325 int current_group = 0;
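  /* Each data reference found below is tagged with the current group id;
     a stmt we cannot handle as a data reference starts a new group, which
     lets the later access analysis keep references from different groups
     apart.  */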
4326
4327 for (unsigned i = 0; i < bbs.length (); i++)
4328 {
4329 basic_block bb = bbs[i];
4330 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4331 gsi_next (&gsi))
4332 {
4333 gimple *stmt = gsi_stmt (gsi);
4334 if (is_gimple_debug (stmt))
4335 continue;
4336
4337 insns++;
4338
4339 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4340 vect_location = stmt;
4341
4342 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4343 &dataref_groups, current_group))
4344 ++current_group;
4345 }
4346 }
4347
4348 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4349 }
4350
4351 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4352 true if anything in the basic-block was vectorized. */
4353
4354 bool
4355 vect_slp_bb (basic_block bb)
4356 {
4357 auto_vec<basic_block> bbs;
4358 bbs.safe_push (bb);
4359 return vect_slp_bbs (bbs);
4360 }
4361
4362 /* Main entry for the BB vectorizer applied to a function. Analyze and
4363 transform all regions of FUN, returning true if anything was vectorized. */
4364
4365 bool
4366 vect_slp_function (function *fun)
4367 {
4368 bool r = false;
4369 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4370 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4371
4372 /* For the moment split the function into pieces to avoid making
4373 the iteration on the vector mode moot. Split at points we know
4374 we do not handle well, which are CFG merges (SLP discovery doesn't
4375 handle non-loop-header PHIs) and loop exits. Since pattern
4376 recog requires reverse iteration to visit uses before defs
4377 simply chop RPO into pieces. */
4378 auto_vec<basic_block> bbs;
4379 for (unsigned i = 0; i < n; i++)
4380 {
4381 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4382 bool split = false;
4383
4384 /* Split when a BB is not dominated by the first block. */
4385 if (!bbs.is_empty ()
4386 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4387 {
4388 if (dump_enabled_p ())
4389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4390 "splitting region at dominance boundary bb%d\n",
4391 bb->index);
4392 split = true;
4393 }
4394 /* Split when the loop determined by the first block
4395 is exited. This is because we eventually insert
4396 invariants at region begin. */
4397 else if (!bbs.is_empty ()
4398 && bbs[0]->loop_father != bb->loop_father
4399 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4400 {
4401 if (dump_enabled_p ())
4402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4403 "splitting region at loop %d exit at bb%d\n",
4404 bbs[0]->loop_father->num, bb->index);
4405 split = true;
4406 }
4407
4408 if (split && !bbs.is_empty ())
4409 {
4410 r |= vect_slp_bbs (bbs);
4411 bbs.truncate (0);
4412 bbs.quick_push (bb);
4413 }
4414 else
4415 bbs.safe_push (bb);
4416
4417 /* When a stmt ending this block defines a value, inserting a
4418 vector containing its definition after it would require inserting
4419 on edges. Avoid this for now. */
4420 if (gimple *last = last_stmt (bb))
4421 if (gimple_get_lhs (last)
4422 && is_ctrl_altering_stmt (last))
4423 {
4424 if (dump_enabled_p ())
4425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4426 "splitting region at control altering "
4427 "definition %G", last);
4428 r |= vect_slp_bbs (bbs);
4429 bbs.truncate (0);
4430 }
4431 }
4432
4433 if (!bbs.is_empty ())
4434 r |= vect_slp_bbs (bbs);
4435
4436 free (rpo);
4437
4438 return r;
4439 }
4440
4441 /* Build a variable-length vector in which the elements in ELTS are repeated
4442 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4443 RESULTS and add any new instructions to SEQ.
4444
4445 The approach we use is:
4446
4447 (1) Find a vector mode VM with integer elements of mode IM.
4448
4449 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4450 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4451 from small vectors to IM.
4452
4453 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4454
4455 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4456 correct byte contents.
4457
4458 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4459
4460 We try to find the largest IM for which this sequence works, in order
4461 to cut down on the number of interleaves. */
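/* A small illustration, assuming the target supports the modes involved:
   for ELTS = { a, b, c, d } of HImode, step (1) might pick IM = SImode so
   that NELTS' == 2.  Step (2) builds the SImode values ab = VIEW_CONVERT
   {a, b} and cd = VIEW_CONVERT {c, d}, step (3) splats them into
   { ab, ab, ... } and { cd, cd, ... }, the single interleave of step (4)
   yields { ab, cd, ab, cd, ... } whose bytes are those of a, b, c, d
   repeated, and step (5) view-converts the result back to the requested
   HImode vector type.  */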
4462
4463 void
4464 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4465 vec<tree> elts, unsigned int nresults,
4466 vec<tree> &results)
4467 {
4468 unsigned int nelts = elts.length ();
4469 tree element_type = TREE_TYPE (vector_type);
4470
4471 /* (1) Find a vector mode VM with integer elements of mode IM. */
4472 unsigned int nvectors = 1;
4473 tree new_vector_type;
4474 tree permutes[2];
4475 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4476 &nvectors, &new_vector_type,
4477 permutes))
4478 gcc_unreachable ();
4479
4480 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4481 unsigned int partial_nelts = nelts / nvectors;
4482 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4483
4484 tree_vector_builder partial_elts;
4485 auto_vec<tree, 32> pieces (nvectors * 2);
4486 pieces.quick_grow (nvectors * 2);
4487 for (unsigned int i = 0; i < nvectors; ++i)
4488 {
4489 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4490 ELTS' has mode IM. */
4491 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4492 for (unsigned int j = 0; j < partial_nelts; ++j)
4493 partial_elts.quick_push (elts[i * partial_nelts + j]);
4494 tree t = gimple_build_vector (seq, &partial_elts);
4495 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4496 TREE_TYPE (new_vector_type), t);
4497
4498 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4499 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4500 }
4501
4502 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4503 correct byte contents.
4504
4505 We need to repeat the following operation log2(nvectors) times:
4506
4507 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4508 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4509
4510 However, if each input repeats every N elements and the VF is
4511 a multiple of N * 2, the HI result is the same as the LO. */
4512 unsigned int in_start = 0;
4513 unsigned int out_start = nvectors;
4514 unsigned int hi_start = nvectors / 2;
4515 /* A bound on the number of outputs needed to produce NRESULTS results
4516 in the final iteration. */
4517 unsigned int noutputs_bound = nvectors * nresults;
4518 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4519 {
4520 noutputs_bound /= 2;
4521 unsigned int limit = MIN (noutputs_bound, nvectors);
4522 for (unsigned int i = 0; i < limit; ++i)
4523 {
4524 if ((i & 1) != 0
4525 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4526 2 * in_repeat))
4527 {
4528 pieces[out_start + i] = pieces[out_start + i - 1];
4529 continue;
4530 }
4531
4532 tree output = make_ssa_name (new_vector_type);
4533 tree input1 = pieces[in_start + (i / 2)];
4534 tree input2 = pieces[in_start + (i / 2) + hi_start];
4535 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4536 input1, input2,
4537 permutes[i & 1]);
4538 gimple_seq_add_stmt (seq, stmt);
4539 pieces[out_start + i] = output;
4540 }
4541 std::swap (in_start, out_start);
4542 }
4543
4544 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4545 results.reserve (nresults);
4546 for (unsigned int i = 0; i < nresults; ++i)
4547 if (i < nvectors)
4548 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4549 pieces[in_start + i]));
4550 else
4551 results.quick_push (results[i - nvectors]);
4552 }
4553
4554
4555 /* For constant and loop invariant defs in OP_NODE this function creates
4556 vector defs that will be used in the vectorized stmts and stores them
4557 to SLP_TREE_VEC_DEFS of OP_NODE. */
4558
4559 static void
4560 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4561 {
4562 unsigned HOST_WIDE_INT nunits;
4563 tree vec_cst;
4564 unsigned j, number_of_places_left_in_vector;
4565 tree vector_type;
4566 tree vop;
4567 int group_size = op_node->ops.length ();
4568 unsigned int vec_num, i;
4569 unsigned number_of_copies = 1;
4570 bool constant_p;
4571 gimple_seq ctor_seq = NULL;
4572 auto_vec<tree, 16> permute_results;
4573
4574 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4575 vector_type = SLP_TREE_VECTYPE (op_node);
4576
4577 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4578 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4579 auto_vec<tree> voprnds (number_of_vectors);
4580
4581 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4582 created vectors. It is greater than 1 if unrolling is performed.
4583
4584 For example, we have two scalar operands, s1 and s2 (e.g., group of
4585 strided accesses of size two), while NUNITS is four (i.e., four scalars
4586 of this type can be packed in a vector). The output vector will contain
4587 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4588 will be 2).
4589
4590 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4591 containing the operands.
4592
4593 For example, NUNITS is four as before, and the group size is 8
4594 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4595 {s5, s6, s7, s8}. */
4596
4597 /* When using duplicate_and_interleave, we just need one element for
4598 each scalar statement. */
4599 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4600 nunits = group_size;
4601
4602 number_of_copies = nunits * number_of_vectors / group_size;
4603
4604 number_of_places_left_in_vector = nunits;
4605 constant_p = true;
4606 tree_vector_builder elts (vector_type, nunits, 1);
4607 elts.quick_grow (nunits);
4608 stmt_vec_info insert_after = NULL;
4609 for (j = 0; j < number_of_copies; j++)
4610 {
4611 tree op;
4612 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4613 {
4614 /* Create 'vect_ = {op0,op1,...,opn}'. */
4615 number_of_places_left_in_vector--;
4616 tree orig_op = op;
4617 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4618 {
4619 if (CONSTANT_CLASS_P (op))
4620 {
4621 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4622 {
4623 /* Can't use VIEW_CONVERT_EXPR for booleans because
4624 of possibly different sizes of scalar value and
4625 vector element. */
4626 if (integer_zerop (op))
4627 op = build_int_cst (TREE_TYPE (vector_type), 0);
4628 else if (integer_onep (op))
4629 op = build_all_ones_cst (TREE_TYPE (vector_type));
4630 else
4631 gcc_unreachable ();
4632 }
4633 else
4634 op = fold_unary (VIEW_CONVERT_EXPR,
4635 TREE_TYPE (vector_type), op);
4636 gcc_assert (op && CONSTANT_CLASS_P (op));
4637 }
4638 else
4639 {
4640 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4641 gimple *init_stmt;
4642 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4643 {
4644 tree true_val
4645 = build_all_ones_cst (TREE_TYPE (vector_type));
4646 tree false_val
4647 = build_zero_cst (TREE_TYPE (vector_type));
4648 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4649 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4650 op, true_val,
4651 false_val);
4652 }
4653 else
4654 {
4655 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4656 op);
4657 init_stmt
4658 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4659 op);
4660 }
4661 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4662 op = new_temp;
4663 }
4664 }
4665 elts[number_of_places_left_in_vector] = op;
4666 if (!CONSTANT_CLASS_P (op))
4667 constant_p = false;
4668 /* For BB vectorization we have to compute an insert location
4669 when a def is inside the analyzed region since we cannot
4670 simply insert at the BB start in this case. */
4671 stmt_vec_info opdef;
4672 if (TREE_CODE (orig_op) == SSA_NAME
4673 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4674 && is_a <bb_vec_info> (vinfo)
4675 && (opdef = vinfo->lookup_def (orig_op)))
4676 {
4677 if (!insert_after)
4678 insert_after = opdef;
4679 else
4680 insert_after = get_later_stmt (insert_after, opdef);
4681 }
4682
4683 if (number_of_places_left_in_vector == 0)
4684 {
4685 if (constant_p
4686 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4687 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4688 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4689 else
4690 {
4691 if (permute_results.is_empty ())
4692 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4693 elts, number_of_vectors,
4694 permute_results);
4695 vec_cst = permute_results[number_of_vectors - j - 1];
4696 }
4697 if (!gimple_seq_empty_p (ctor_seq))
4698 {
4699 if (insert_after)
4700 {
4701 gimple_stmt_iterator gsi;
4702 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4703 {
4704 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4705 gsi_insert_seq_before (&gsi, ctor_seq,
4706 GSI_CONTINUE_LINKING);
4707 }
4708 else if (!stmt_ends_bb_p (insert_after->stmt))
4709 {
4710 gsi = gsi_for_stmt (insert_after->stmt);
4711 gsi_insert_seq_after (&gsi, ctor_seq,
4712 GSI_CONTINUE_LINKING);
4713 }
4714 else
4715 {
4716 /* When we want to insert after a def where the
4717 defining stmt throws then insert on the fallthru
4718 edge. */
4719 edge e = find_fallthru_edge
4720 (gimple_bb (insert_after->stmt)->succs);
4721 basic_block new_bb
4722 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4723 gcc_assert (!new_bb);
4724 }
4725 }
4726 else
4727 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4728 ctor_seq = NULL;
4729 }
4730 voprnds.quick_push (vec_cst);
4731 insert_after = NULL;
4732 number_of_places_left_in_vector = nunits;
4733 constant_p = true;
4734 elts.new_vector (vector_type, nunits, 1);
4735 elts.quick_grow (nunits);
4736 }
4737 }
4738 }
4739
4740 /* Since the vectors are created in the reverse order, we should invert
4741 them. */
4742 vec_num = voprnds.length ();
4743 for (j = vec_num; j != 0; j--)
4744 {
4745 vop = voprnds[j - 1];
4746 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4747 }
4748
4749 /* In case that VF is greater than the unrolling factor needed for the SLP
4750 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4751 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4752 to replicate the vectors. */
4753 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4754 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4755 i++)
4756 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4757 }
4758
4759 /* Get the Ith vectorized definition from SLP_NODE. */
4760
4761 tree
4762 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4763 {
4764 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4765 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4766 else
4767 return SLP_TREE_VEC_DEFS (slp_node)[i];
4768 }
4769
4770 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4771
4772 void
4773 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4774 {
4775 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4776 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4777 {
4778 unsigned j;
4779 gimple *vec_def_stmt;
4780 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4781 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4782 }
4783 else
4784 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4785 }
4786
4787 /* Get N vectorized definitions for SLP_NODE. */
4788
4789 void
4790 vect_get_slp_defs (vec_info *,
4791 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4792 {
4793 if (n == -1U)
4794 n = SLP_TREE_CHILDREN (slp_node).length ();
4795
4796 for (unsigned i = 0; i < n; ++i)
4797 {
4798 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4799 vec<tree> vec_defs = vNULL;
4800 vect_get_slp_defs (child, &vec_defs);
4801 vec_oprnds->quick_push (vec_defs);
4802 }
4803 }
4804
4805 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4806 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4807 permute statements for the SLP node NODE. */
4808
4809 bool
4810 vect_transform_slp_perm_load (vec_info *vinfo,
4811 slp_tree node, vec<tree> dr_chain,
4812 gimple_stmt_iterator *gsi, poly_uint64 vf,
4813 bool analyze_only, unsigned *n_perms)
4814 {
4815 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4816 int vec_index = 0;
4817 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4818 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4819 unsigned int mask_element;
4820 machine_mode mode;
4821
4822 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4823 return false;
4824
4825 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4826
4827 mode = TYPE_MODE (vectype);
4828 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4829
4830 /* Initialize the vect stmts of NODE to properly insert the generated
4831 stmts later. */
4832 if (! analyze_only)
4833 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4834 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4835 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4836
4837 /* Generate permutation masks for every NODE. Number of masks for each NODE
4838 is equal to GROUP_SIZE.
4839 E.g., we have a group of three nodes with three loads from the same
4840 location in each node, and the vector size is 4. I.e., we have an
4841 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4842 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4843 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4844 ...
4845
4846 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4847 The last mask is illegal since we assume two operands for permute
4848 operation, and the mask element values can't be outside that range.
4849 Hence, the last mask must be converted into {2,5,5,5}.
4850 For the first two permutations we need the first and the second input
4851 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4852 we need the second and the third vectors: {b1,c1,a2,b2} and
4853 {c2,a3,b3,c3}. */
4854
4855 int vect_stmts_counter = 0;
4856 unsigned int index = 0;
4857 int first_vec_index = -1;
4858 int second_vec_index = -1;
4859 bool noop_p = true;
4860 *n_perms = 0;
4861
4862 vec_perm_builder mask;
4863 unsigned int nelts_to_build;
4864 unsigned int nvectors_per_build;
4865 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4866 && multiple_p (nunits, group_size));
4867 if (repeating_p)
4868 {
4869 /* A single vector contains a whole number of copies of the node, so:
4870 (a) all permutes can use the same mask; and
4871 (b) the permutes only need a single vector input. */
4872 mask.new_vector (nunits, group_size, 3);
4873 nelts_to_build = mask.encoded_nelts ();
4874 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4875 }
4876 else
4877 {
4878 /* We need to construct a separate mask for each vector statement. */
4879 unsigned HOST_WIDE_INT const_nunits, const_vf;
4880 if (!nunits.is_constant (&const_nunits)
4881 || !vf.is_constant (&const_vf))
4882 return false;
4883 mask.new_vector (const_nunits, const_nunits, 1);
4884 nelts_to_build = const_vf * group_size;
4885 nvectors_per_build = 1;
4886 }
4887
4888 unsigned int count = mask.encoded_nelts ();
4889 mask.quick_grow (count);
4890 vec_perm_indices indices;
4891
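  /* In the !repeating_p case, for instance with const_nunits == 4,
     const_vf == 4 and a group size of 2, we build 8 mask elements below
     and flush a candidate permute mask every 4 of them, i.e. one mask per
     vector stmt (illustrative numbers).  */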
4892 for (unsigned int j = 0; j < nelts_to_build; j++)
4893 {
4894 unsigned int iter_num = j / group_size;
4895 unsigned int stmt_num = j % group_size;
4896 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4897 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4898 if (repeating_p)
4899 {
4900 first_vec_index = 0;
4901 mask_element = i;
4902 }
4903 else
4904 {
4905 /* Enforced before the loop when !repeating_p. */
4906 unsigned int const_nunits = nunits.to_constant ();
4907 vec_index = i / const_nunits;
4908 mask_element = i % const_nunits;
4909 if (vec_index == first_vec_index
4910 || first_vec_index == -1)
4911 {
4912 first_vec_index = vec_index;
4913 }
4914 else if (vec_index == second_vec_index
4915 || second_vec_index == -1)
4916 {
4917 second_vec_index = vec_index;
4918 mask_element += const_nunits;
4919 }
4920 else
4921 {
4922 if (dump_enabled_p ())
4923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4924 "permutation requires at "
4925 "least three vectors %G",
4926 stmt_info->stmt);
4927 gcc_assert (analyze_only);
4928 return false;
4929 }
4930
4931 gcc_assert (mask_element < 2 * const_nunits);
4932 }
4933
4934 if (mask_element != index)
4935 noop_p = false;
4936 mask[index++] = mask_element;
4937
4938 if (index == count && !noop_p)
4939 {
4940 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4941 if (!can_vec_perm_const_p (mode, indices))
4942 {
4943 if (dump_enabled_p ())
4944 {
4945 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4946 vect_location,
4947 "unsupported vect permute { ");
4948 for (i = 0; i < count; ++i)
4949 {
4950 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4951 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4952 }
4953 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4954 }
4955 gcc_assert (analyze_only);
4956 return false;
4957 }
4958
4959 ++*n_perms;
4960 }
4961
4962 if (index == count)
4963 {
4964 if (!analyze_only)
4965 {
4966 tree mask_vec = NULL_TREE;
4967
4968 if (! noop_p)
4969 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4970
4971 if (second_vec_index == -1)
4972 second_vec_index = first_vec_index;
4973
4974 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4975 {
4976 /* Generate the permute statement if necessary. */
4977 tree first_vec = dr_chain[first_vec_index + ri];
4978 tree second_vec = dr_chain[second_vec_index + ri];
4979 gimple *perm_stmt;
4980 if (! noop_p)
4981 {
4982 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4983 tree perm_dest
4984 = vect_create_destination_var (gimple_assign_lhs (stmt),
4985 vectype);
4986 perm_dest = make_ssa_name (perm_dest);
4987 perm_stmt
4988 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4989 first_vec, second_vec,
4990 mask_vec);
4991 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4992 gsi);
4993 }
4994 else
4995 /* If mask was NULL_TREE generate the requested
4996 identity transform. */
4997 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4998
4999 /* Store the vector statement in NODE. */
5000 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5001 }
5002 }
5003
5004 index = 0;
5005 first_vec_index = -1;
5006 second_vec_index = -1;
5007 noop_p = true;
5008 }
5009 }
5010
5011 return true;
5012 }
5013
5014
5015 /* Vectorize the SLP permutations in NODE as specified
5016 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5017 child number and lane number.
5018 Interleaving of two two-lane two-child SLP subtrees (not supported):
5019 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5020 A blend of two four-lane two-child SLP subtrees:
5021 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5022 Highpart of a four-lane one-child SLP subtree (not supported):
5023 [ { 0, 2 }, { 0, 3 } ]
5024 Currently only a subset is supported by the code generation below. */
5025
5026 static bool
5027 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5028 slp_tree node, stmt_vector_for_cost *cost_vec)
5029 {
5030 tree vectype = SLP_TREE_VECTYPE (node);
5031
5032 /* ??? We currently only support the case where all vector input and output
5033 types are the same, while the SLP IL should really do a concat + select
5034 and thus accept arbitrary mismatches. */
5035 slp_tree child;
5036 unsigned i;
5037 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5038 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5039 {
5040 if (dump_enabled_p ())
5041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5042 "Unsupported lane permutation\n");
5043 return false;
5044 }
5045
5046 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5047 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5048 if (dump_enabled_p ())
5049 {
5050 dump_printf_loc (MSG_NOTE, vect_location,
5051 "vectorizing permutation");
5052 for (unsigned i = 0; i < perm.length (); ++i)
5053 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5054 dump_printf (MSG_NOTE, "\n");
5055 }
5056
5057 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5058 if (!nunits.is_constant ())
5059 return false;
5060 unsigned HOST_WIDE_INT vf = 1;
5061 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5062 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5063 return false;
5064 unsigned olanes = vf * SLP_TREE_LANES (node);
5065 gcc_assert (multiple_p (olanes, nunits));
5066
5067 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5068 from the { SLP operand, scalar lane } permutation as recorded in the
5069 SLP node, as an intermediate step. This part should already work
5070 with SLP children with arbitrary number of lanes. */
5071 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5072 auto_vec<unsigned> active_lane;
5073 vperm.create (olanes);
5074 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5075 for (unsigned i = 0; i < vf; ++i)
5076 {
5077 for (unsigned pi = 0; pi < perm.length (); ++pi)
5078 {
5079 std::pair<unsigned, unsigned> p = perm[pi];
5080 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5081 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5082 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5083 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5084 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5085 }
5086 /* Advance to the next group. */
5087 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5088 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5089 }
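  /* E.g. for a blend [ { 0, 0 }, { 1, 1 } ] of two single-vector children
     with vf == 1 this produces vperm = { { { 0, 0 }, 0 }, { { 1, 0 }, 1 } },
     i.e. lane 0 of child 0's vector 0 followed by lane 1 of child 1's
     vector 0 (assuming each child vector holds at least two lanes).  */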
5090
5091 if (dump_enabled_p ())
5092 {
5093 dump_printf_loc (MSG_NOTE, vect_location, "as");
5094 for (unsigned i = 0; i < vperm.length (); ++i)
5095 {
5096 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5097 dump_printf (MSG_NOTE, ",");
5098 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5099 vperm[i].first.first, vperm[i].first.second,
5100 vperm[i].second);
5101 }
5102 dump_printf (MSG_NOTE, "\n");
5103 }
5104
5105 /* We can only handle two-vector permutes, everything else should
5106 be lowered on the SLP level. The following is closely inspired
5107 by vect_transform_slp_perm_load and is supposed to eventually
5108 replace it.
5109 ??? As intermediate step do code-gen in the SLP tree representation
5110 somehow? */
5111 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5112 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5113 unsigned int const_nunits = nunits.to_constant ();
5114 unsigned int index = 0;
5115 unsigned int mask_element;
5116 vec_perm_builder mask;
5117 mask.new_vector (const_nunits, const_nunits, 1);
5118 unsigned int count = mask.encoded_nelts ();
5119 mask.quick_grow (count);
5120 vec_perm_indices indices;
5121 unsigned nperms = 0;
5122 for (unsigned i = 0; i < vperm.length (); ++i)
5123 {
5124 mask_element = vperm[i].second;
5125 if (first_vec.first == -1U
5126 || first_vec == vperm[i].first)
5127 first_vec = vperm[i].first;
5128 else if (second_vec.first == -1U
5129 || second_vec == vperm[i].first)
5130 {
5131 second_vec = vperm[i].first;
5132 mask_element += const_nunits;
5133 }
5134 else
5135 {
5136 if (dump_enabled_p ())
5137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5138 "permutation requires at "
5139 "least three vectors");
5140 gcc_assert (!gsi);
5141 return false;
5142 }
5143
5144 mask[index++] = mask_element;
5145
5146 if (index == count)
5147 {
5148 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5149 const_nunits);
5150 bool identity_p = indices.series_p (0, 1, 0, 1);
5151 if (!identity_p
5152 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5153 {
5154 if (dump_enabled_p ())
5155 {
5156 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5157 vect_location,
5158 "unsupported vect permute { ");
5159 for (i = 0; i < count; ++i)
5160 {
5161 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5162 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5163 }
5164 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5165 }
5166 gcc_assert (!gsi);
5167 return false;
5168 }
5169
5170 if (!identity_p)
5171 nperms++;
5172 if (gsi)
5173 {
5174 if (second_vec.first == -1U)
5175 second_vec = first_vec;
5176
5177 /* Generate the permute statement if necessary. */
5178 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5179 tree first_def
5180 = vect_get_slp_vect_def (first_node, first_vec.second);
5181 gassign *perm_stmt;
5182 tree perm_dest = make_ssa_name (vectype);
5183 if (!identity_p)
5184 {
5185 slp_tree second_node
5186 = SLP_TREE_CHILDREN (node)[second_vec.first];
5187 tree second_def
5188 = vect_get_slp_vect_def (second_node, second_vec.second);
5189 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5190 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5191 first_def, second_def,
5192 mask_vec);
5193 }
5194 else
5195 /* We need a copy here in case the def was external. */
5196 perm_stmt = gimple_build_assign (perm_dest, first_def);
5197 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5198 /* Store the vector statement in NODE. */
5199 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5200 }
5201
5202 index = 0;
5203 first_vec = std::make_pair (-1U, -1U);
5204 second_vec = std::make_pair (-1U, -1U);
5205 }
5206 }
5207
5208 if (!gsi)
5209 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5210
5211 return true;
5212 }
5213
5214 /* Vectorize SLP NODE. */
5215
5216 static void
5217 vect_schedule_slp_node (vec_info *vinfo,
5218 slp_tree node, slp_instance instance)
5219 {
5220 gimple_stmt_iterator si;
5221 int i;
5222 slp_tree child;
5223
5224 /* For existing vectors there's nothing to do. */
5225 if (SLP_TREE_VEC_DEFS (node).exists ())
5226 return;
5227
5228 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5229
5230 /* Vectorize externals and constants. */
5231 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5232 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5233 {
5234 /* ??? vectorizable_shift can end up using a scalar operand which is
5235 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5236 node in this case. */
5237 if (!SLP_TREE_VECTYPE (node))
5238 return;
5239
5240 vect_create_constant_vectors (vinfo, node);
5241 return;
5242 }
5243
5244 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5245
5246 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5247 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5248
5249 if (dump_enabled_p ())
5250 dump_printf_loc (MSG_NOTE, vect_location,
5251 "------>vectorizing SLP node starting from: %G",
5252 stmt_info->stmt);
5253
5254 if (STMT_VINFO_DATA_REF (stmt_info)
5255 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5256 {
5257 /* Vectorized loads go before the first scalar load to make it
5258 ready early, vectorized stores go before the last scalar
5259 stmt which is where all uses are ready. */
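/* In either case the vector stmts end up emitted immediately before SI.  */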
5260 stmt_vec_info last_stmt_info = NULL;
5261 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5262 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5263 else /* DR_IS_WRITE */
5264 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5265 si = gsi_for_stmt (last_stmt_info->stmt);
5266 }
5267 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5268 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5269 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5270 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5271 {
5272 /* For PHI node vectorization we do not use the insertion iterator. */
5273 si = gsi_none ();
5274 }
5275 else
5276 {
5277 /* Emit other stmts after the last of the children's vectorized defs,
5278 which is the earliest insertion point possible. */
5279 gimple *last_stmt = NULL;
5280 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5281 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5282 {
5283 /* For fold-left reductions we are retaining the scalar
5284 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5285 set so the representation isn't perfect. Resort to the
5286 last scalar def here. */
5287 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5288 {
5289 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5290 == cycle_phi_info_type);
5291 gphi *phi = as_a <gphi *>
5292 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5293 if (!last_stmt
5294 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5295 last_stmt = phi;
5296 }
5297 /* We are emitting all vectorized stmts of a child in the same place,
5298 so the last one pushed is also the last one emitted.
5299 ??? Unless we have a load permutation applied and that
5300 figures to re-use an earlier generated load. */
5301 unsigned j;
5302 gimple *vstmt;
5303 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5304 if (!last_stmt
5305 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5306 last_stmt = vstmt;
5307 }
5308 else if (!SLP_TREE_VECTYPE (child))
5309 {
5310 /* For externals without a vector type, the scalar defs are used unvectorized; look at all of them. */
5311 unsigned j;
5312 tree def;
5313 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5314 if (TREE_CODE (def) == SSA_NAME
5315 && !SSA_NAME_IS_DEFAULT_DEF (def))
5316 {
5317 gimple *stmt = SSA_NAME_DEF_STMT (def);
5318 if (!last_stmt
5319 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5320 last_stmt = stmt;
5321 }
5322 }
5323 else
5324 {
5325 /* For externals we have to look at all defs since their
5326 insertion place is decided per vector. But beware
5327 of pre-existing vectors where we need to make sure
5328 we do not insert before the region boundary. */
5329 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5330 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5331 last_stmt = gsi_stmt (gsi_after_labels
5332 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5333 else
5334 {
5335 unsigned j;
5336 tree vdef;
5337 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5338 if (TREE_CODE (vdef) == SSA_NAME
5339 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5340 {
5341 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5342 if (!last_stmt
5343 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5344 last_stmt = vstmt;
5345 }
5346 }
5347 }
5348 /* This can happen when all children are pre-existing vectors or
5349 constants. */
5350 if (!last_stmt)
5351 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5352 if (is_a <gphi *> (last_stmt))
5353 si = gsi_after_labels (gimple_bb (last_stmt));
5354 else
5355 {
5356 si = gsi_for_stmt (last_stmt);
5357 gsi_next (&si);
5358 }
5359 }
5360
5361 bool done_p = false;
5362
5363 /* Handle purely internal nodes. */
5364 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5365 {
5366 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
5367 be shared with different SLP nodes (but usually it's the same
5368 operation apart from the case where the stmt is only there to denote
5369 the actual scalar lane defs ...). So do not call vect_transform_stmt
5370 but open-code it here (partly). */
5371 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5372 gcc_assert (done);
5373 done_p = true;
5374 }
5375 if (!done_p)
5376 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5377 }
5378
5379 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5380 For loop vectorization this is done in vectorizable_call, but for SLP
5381 it needs to be deferred until end of vect_schedule_slp, because multiple
5382 SLP instances may refer to the same scalar stmt. */
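/* Replacing each such call with LHS = 0 instead of deleting it keeps the
   SSA web valid for scalar uses that were not vectorized; the zero
   assignment is trivially dead if no such uses remain.  */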
5383
5384 static void
5385 vect_remove_slp_scalar_calls (vec_info *vinfo,
5386 slp_tree node, hash_set<slp_tree> &visited)
5387 {
5388 gimple *new_stmt;
5389 gimple_stmt_iterator gsi;
5390 int i;
5391 slp_tree child;
5392 tree lhs;
5393 stmt_vec_info stmt_info;
5394
5395 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5396 return;
5397
5398 if (visited.add (node))
5399 return;
5400
5401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5402 vect_remove_slp_scalar_calls (vinfo, child, visited);
5403
5404 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5405 {
5406 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5407 if (!stmt || gimple_bb (stmt) == NULL)
5408 continue;
5409 if (is_pattern_stmt_p (stmt_info)
5410 || !PURE_SLP_STMT (stmt_info))
5411 continue;
5412 lhs = gimple_call_lhs (stmt);
5413 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5414 gsi = gsi_for_stmt (stmt);
5415 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5416 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5417 }
5418 }
5419
5420 static void
5421 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5422 {
5423 hash_set<slp_tree> visited;
5424 vect_remove_slp_scalar_calls (vinfo, node, visited);
5425 }
5426
5427 /* Vectorize the instance root. */
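/* If a single vector stmt covers the root, its LHS is assigned directly
   (through a VIEW_CONVERT_EXPR when the types differ); with multiple
   vector stmts a CONSTRUCTOR of their LHSs is built and assigned.  */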
5428
5429 void
5430 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5431 {
5432 gassign *rstmt = NULL;
5433
5434 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5435 {
5436 gimple *child_stmt;
5437 int j;
5438
5439 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5440 {
5441 tree vect_lhs = gimple_get_lhs (child_stmt);
5442 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5443 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5444 TREE_TYPE (vect_lhs)))
5445 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5446 vect_lhs);
5447 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5448 break;
5449 }
5450 }
5451 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5452 {
5453 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5454 gimple *child_stmt;
5455 int j;
5456 vec<constructor_elt, va_gc> *v;
5457 vec_alloc (v, nelts);
5458
5459 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5460 {
5461 CONSTRUCTOR_APPEND_ELT (v,
5462 NULL_TREE,
5463 gimple_get_lhs (child_stmt));
5464 }
5465 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5466 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5467 tree r_constructor = build_constructor (rtype, v);
5468 rstmt = gimple_build_assign (lhs, r_constructor);
5469 }
5470
5471 gcc_assert (rstmt);
5472
5473 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5474 gsi_replace (&rgsi, rstmt, true);
5475 }
5476
5477 struct slp_scc_info
5478 {
5479 bool on_stack;
5480 int dfs;
5481 int lowlink;
5482 };
5483
5484 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
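/* This is an instance of Tarjan's SCC algorithm: nodes get DFS and lowlink
   numbers, stay on STACK while their SCC is open, and an SCC is complete
   once a node's lowlink equals its own DFS number.  */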
5485
5486 static void
5487 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5488 hash_map<slp_tree, slp_scc_info> &scc_info,
5489 int &maxdfs, vec<slp_tree> &stack)
5490 {
5491 bool existed_p;
5492 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5493 gcc_assert (!existed_p);
5494 info->dfs = maxdfs;
5495 info->lowlink = maxdfs;
5496 info->on_stack = true;
5497 maxdfs++;
5498 stack.safe_push (node);
5499 unsigned i;
5500 slp_tree child;
5501
5502 /* ??? We're keeping SLP_TREE_CHILDREN of externalized nodes. */
5503 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5504 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5505 {
5506 if (!child)
5507 continue;
5508 slp_scc_info *child_info = scc_info.get (child);
5509 if (!child_info)
5510 {
5511 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5512 /* Recursion might have re-allocated the hash map, invalidating INFO. */
5513 info = scc_info.get (node);
5514 child_info = scc_info.get (child);
5515 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5516 }
5517 else if (child_info->on_stack)
5518 info->lowlink = MIN (info->lowlink, child_info->dfs);
5519 }
5520 if (info->lowlink != info->dfs)
5521 return;
5522
5523 /* Singleton. */
5524 if (stack.last () == node)
5525 {
5526 stack.pop ();
5527 info->on_stack = false;
5528 vect_schedule_slp_node (vinfo, node, instance);
5529 return;
5530 }
5531 /* SCC. */
5532 int last_idx = stack.length () - 1;
5533 while (stack[last_idx] != node)
5534 last_idx--;
5535 /* We can break the cycle at PHIs that have at least one child
5536 already code generated. Then we could re-start the DFS walk until
5537 all nodes in the SCC are covered (we might have new entries
5538 for only back-reachable nodes). But it's simpler to just
5539 iterate and schedule those that are ready. */
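/* A non-PHI node is ready once none of its children remain on the stack;
   a PHI is ready as soon as one of its arguments is either missing or
   already scheduled.  The remaining (backedge) arguments of vectorized
   PHIs are filled in once the whole SCC is scheduled.  */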
5540 auto_vec<slp_tree, 4> phis_to_fixup;
5541 unsigned todo = stack.length () - last_idx;
5542 do
5543 {
5544 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5545 {
5546 slp_tree entry = stack[idx];
5547 if (!entry)
5548 continue;
5549 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5550 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
5551 bool ready = !phi;
5552 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5553 if (!child)
5554 {
5555 gcc_assert (phi);
5556 ready = true;
5557 break;
5558 }
5559 else if (scc_info.get (child)->on_stack)
5560 {
5561 if (!phi)
5562 {
5563 ready = false;
5564 break;
5565 }
5566 }
5567 else
5568 {
5569 if (phi)
5570 {
5571 ready = true;
5572 break;
5573 }
5574 }
5575 if (ready)
5576 {
5577 vect_schedule_slp_node (vinfo, entry, instance);
5578 scc_info.get (entry)->on_stack = false;
5579 stack[idx] = NULL;
5580 todo--;
5581 if (phi)
5582 phis_to_fixup.safe_push (entry);
5583 }
5584 }
5585 }
5586 while (todo != 0);
5587
5588 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
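/* These arguments could not be set when the vectorized PHI copies were
   created because the defs on those edges had not been generated yet;
   fill in one vector def per PHI copy for each such incoming edge.  */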
5589 slp_tree phi_node;
5590 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5591 {
5592 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5593 edge_iterator ei;
5594 edge e;
5595 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5596 {
5597 unsigned dest_idx = e->dest_idx;
5598 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5599 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5600 continue;
5601 /* Simply fill all args. */
5602 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5603 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5604 vect_get_slp_vect_def (child, i),
5605 e, gimple_phi_arg_location (phi, dest_idx));
5606 }
5607 }
5608
5609 /* Pop the SCC. */
5610 stack.truncate (last_idx);
5611 }
5612
5613 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
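/* All instances are scheduled first, sharing SCC_INFO so SLP nodes reachable
   from multiple instances are code generated only once; afterwards the
   scalar calls and the original scalar stmts of vectorized stores are
   removed in a second pass.  */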
5614
5615 void
5616 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5617 {
5618 slp_instance instance;
5619 unsigned int i;
5620
5621 hash_map<slp_tree, slp_scc_info> scc_info;
5622 int maxdfs = 0;
5623 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5624 {
5625 slp_tree node = SLP_INSTANCE_TREE (instance);
5626 if (dump_enabled_p ())
5627 {
5628 dump_printf_loc (MSG_NOTE, vect_location,
5629 "Vectorizing SLP tree:\n");
5630 if (SLP_INSTANCE_ROOT_STMT (instance))
5631 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5632 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5633 vect_print_slp_graph (MSG_NOTE, vect_location,
5634 SLP_INSTANCE_TREE (instance));
5635 }
5636 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
5637 have a PHI be the node breaking the cycle. */
5638 auto_vec<slp_tree> stack;
5639 if (!scc_info.get (node))
5640 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
5641
5642 if (SLP_INSTANCE_ROOT_STMT (instance))
5643 vectorize_slp_instance_root_stmt (node, instance);
5644
5645 if (dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "vectorizing stmts using SLP.\n");
5648 }
5649
5650 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5651 {
5652 slp_tree root = SLP_INSTANCE_TREE (instance);
5653 stmt_vec_info store_info;
5654 unsigned int j;
5655
5656 /* Remove scalar call stmts. Do not do this for basic-block
5657 vectorization as not all uses may be vectorized.
5658 ??? Why should this be necessary? DCE should be able to
5659 remove the stmts itself.
5660 ??? For BB vectorization we can as well remove scalar
5661 stmts starting from the SLP tree root if they have no
5662 uses. */
5663 if (is_a <loop_vec_info> (vinfo))
5664 vect_remove_slp_scalar_calls (vinfo, root);
5665
5666 /* Remove the original scalar stmts of vectorized stores. */
5667 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5668 {
5669 if (!STMT_VINFO_DATA_REF (store_info)
5670 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5671 break;
5672
5673 store_info = vect_orig_stmt (store_info);
5674 /* Free the attached stmt_vec_info and remove the stmt. */
5675 vinfo->remove_stmt (store_info);
5676 }
5677 }
5678 }