1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51
52 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
53 slp_tree, stmt_vector_for_cost *);
54
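/* Object pool from which all _slp_tree nodes are allocated; the
   class-specific operator new/delete below route allocation and
   deallocation through it.  */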
55 object_allocator<_slp_tree> *slp_tree_pool;
56
57 void *
58 _slp_tree::operator new (size_t n)
59 {
60 gcc_assert (n == sizeof (_slp_tree));
61 return slp_tree_pool->allocate_raw ();
62 }
63
64 void
65 _slp_tree::operator delete (void *node, size_t n)
66 {
67 gcc_assert (n == sizeof (_slp_tree));
68 slp_tree_pool->remove_raw (node);
69 }
70
71
72 /* Initialize a SLP node. */
73
74 _slp_tree::_slp_tree ()
75 {
76 SLP_TREE_SCALAR_STMTS (this) = vNULL;
77 SLP_TREE_SCALAR_OPS (this) = vNULL;
78 SLP_TREE_VEC_STMTS (this) = vNULL;
79 SLP_TREE_VEC_DEFS (this) = vNULL;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
85 SLP_TREE_CODE (this) = ERROR_MARK;
86 SLP_TREE_VECTYPE (this) = NULL_TREE;
87 SLP_TREE_REPRESENTATIVE (this) = NULL;
88 SLP_TREE_REF_COUNT (this) = 1;
89 this->max_nunits = 1;
90 this->lanes = 0;
91 }
92
93 /* Tear down a SLP node. */
94
95 _slp_tree::~_slp_tree ()
96 {
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
104 }
105
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
107
108 static void
109 vect_free_slp_tree (slp_tree node)
110 {
111 int i;
112 slp_tree child;
113
114 if (--SLP_TREE_REF_COUNT (node) != 0)
115 return;
116
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
118 if (child)
119 vect_free_slp_tree (child);
120
121 delete node;
122 }
123
124 /* Return a location suitable for dumps related to the SLP instance. */
125
126 dump_user_location_t
127 _slp_instance::location () const
128 {
129 if (root_stmt)
130 return root_stmt->stmt;
131 else
132 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
133 }
134
135
136 /* Free the memory allocated for the SLP instance. */
137
138 void
139 vect_free_slp_instance (slp_instance instance)
140 {
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
142 SLP_INSTANCE_LOADS (instance).release ();
143 instance->subgraph_entries.release ();
144 instance->cost_vec.release ();
145 free (instance);
146 }
147
148
149 /* Create an SLP node for SCALAR_STMTS in the existing node NODE. */
150
151 static slp_tree
152 vect_create_new_slp_node (slp_tree node,
153 vec<stmt_vec_info> scalar_stmts, unsigned nops)
154 {
155 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
156 SLP_TREE_CHILDREN (node).create (nops);
157 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
158 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
159 SLP_TREE_LANES (node) = scalar_stmts.length ();
160 return node;
161 }
162
163 /* Create an SLP node for SCALAR_STMTS. */
164
165 static slp_tree
166 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
167 {
168 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
169 }
170
171 /* Create an SLP node for OPS in the existing node NODE. */
172
173 static slp_tree
174 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
175 {
176 SLP_TREE_SCALAR_OPS (node) = ops;
177 SLP_TREE_DEF_TYPE (node) = vect_external_def;
178 SLP_TREE_LANES (node) = ops.length ();
179 return node;
180 }
181
182 /* Create an SLP node for OPS. */
183
184 static slp_tree
185 vect_create_new_slp_node (vec<tree> ops)
186 {
187 return vect_create_new_slp_node (new _slp_tree, ops);
188 }
189
190
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
193 node. */
194 typedef struct _slp_oprnd_info
195 {
196 /* Def-stmts for the operands. */
197 vec<stmt_vec_info> def_stmts;
198 /* Operands. */
199 vec<tree> ops;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
202 stmt. */
203 tree first_op_type;
204 enum vect_def_type first_dt;
205 bool any_pattern;
206 } *slp_oprnd_info;
207
208
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
210 operand. */
211 static vec<slp_oprnd_info>
212 vect_create_oprnd_info (int nops, int group_size)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216 vec<slp_oprnd_info> oprnds_info;
217
218 oprnds_info.create (nops);
219 for (i = 0; i < nops; i++)
220 {
221 oprnd_info = XNEW (struct _slp_oprnd_info);
222 oprnd_info->def_stmts.create (group_size);
223 oprnd_info->ops.create (group_size);
224 oprnd_info->first_dt = vect_uninitialized_def;
225 oprnd_info->first_op_type = NULL_TREE;
226 oprnd_info->any_pattern = false;
227 oprnds_info.quick_push (oprnd_info);
228 }
229
230 return oprnds_info;
231 }
232
233
234 /* Free operands info. */
235
236 static void
237 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
238 {
239 int i;
240 slp_oprnd_info oprnd_info;
241
242 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
243 {
244 oprnd_info->def_stmts.release ();
245 oprnd_info->ops.release ();
246 XDELETE (oprnd_info);
247 }
248
249 oprnds_info.release ();
250 }
251
252
253 /* Return true if STMTS contains a pattern statement. */
254
255 static bool
256 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
257 {
258 stmt_vec_info stmt_info;
259 unsigned int i;
260 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
261 if (is_pattern_stmt_p (stmt_info))
262 return true;
263 return false;
264 }
265
266 /* Return true when all lanes in the external or constant NODE have
267 the same value. */
268
269 static bool
270 vect_slp_tree_uniform_p (slp_tree node)
271 {
272 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
274
275 /* Pre-existing vectors. */
276 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
277 return false;
278
279 unsigned i;
280 tree op, first = NULL_TREE;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
282 if (!first)
283 first = op;
284 else if (!operand_equal_p (first, op, 0))
285 return false;
286
287 return true;
288 }
289
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
292 of the chain. */
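/* For instance, for a group accessing a[0], a[1] and a[3] the returned
   places are 0, 1 and 3: DR_GROUP_GAP of each non-first element gives the
   element distance to its predecessor, so gaps in the chain are counted.  */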
293
294 int
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
296 stmt_vec_info first_stmt_info)
297 {
298 stmt_vec_info next_stmt_info = first_stmt_info;
299 int result = 0;
300
301 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
302 return -1;
303
304 do
305 {
306 if (next_stmt_info == stmt_info)
307 return result;
308 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
309 if (next_stmt_info)
310 result += DR_GROUP_GAP (next_stmt_info);
311 }
312 while (next_stmt_info);
313
314 return -1;
315 }
316
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
321 (if nonnull). */
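/* As an illustration, with COUNT == 4 and a 16-bit ELT_TYPE the loop below
   first tries to fuse all four elements into one 64-bit integer
   (NVECTORS == 1); whenever no integer mode of the required size exists,
   the number of fused elements is halved and NVECTORS doubled, until
   either suitable interleaving permutes are found or the elements cannot
   be split any further.  */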
322
323 bool
324 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
325 tree elt_type, unsigned int *nvectors_out,
326 tree *vector_type_out,
327 tree *permutes)
328 {
329 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
330 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
331 return false;
332
333 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
334 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
335 unsigned int nvectors = 1;
336 for (;;)
337 {
338 scalar_int_mode int_mode;
339 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
340 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
341 {
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type = build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode), 1);
345 tree vector_type
346 = get_vectype_for_scalar_type (vinfo, int_type, count);
347 if (vector_type
348 && VECTOR_MODE_P (TYPE_MODE (vector_type))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
350 GET_MODE_SIZE (base_vector_mode)))
351 {
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
356 vec_perm_builder sel1 (nelts, 2, 3);
357 vec_perm_builder sel2 (nelts, 2, 3);
358 poly_int64 half_nelts = exact_div (nelts, 2);
359 for (unsigned int i = 0; i < 3; ++i)
360 {
361 sel1.quick_push (i);
362 sel1.quick_push (i + nelts);
363 sel2.quick_push (half_nelts + i);
364 sel2.quick_push (half_nelts + i + nelts);
365 }
366 vec_perm_indices indices1 (sel1, 2, nelts);
367 vec_perm_indices indices2 (sel2, 2, nelts);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
370 {
371 if (nvectors_out)
372 *nvectors_out = nvectors;
373 if (vector_type_out)
374 *vector_type_out = vector_type;
375 if (permutes)
376 {
377 permutes[0] = vect_gen_perm_mask_checked (vector_type,
378 indices1);
379 permutes[1] = vect_gen_perm_mask_checked (vector_type,
380 indices2);
381 }
382 return true;
383 }
384 }
385 }
386 if (!multiple_p (elt_bytes, 2, &elt_bytes))
387 return false;
388 nvectors *= 2;
389 }
390 }
391
392 /* Return true if DTA and DTB match. */
393
394 static bool
395 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
396 {
397 return (dta == dtb
398 || ((dta == vect_external_def || dta == vect_constant_def)
399 && (dtb == vect_external_def || dtb == vect_constant_def)));
400 }
401
402 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
403 they are of a valid type and that they match the defs of the first stmt of
404 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
405 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
406 indicates swap is required for cond_expr stmts. Specifically, *SWAP
407 is 1 if STMT is cond and operands of comparison need to be swapped;
408 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
409 If there is any operand swap in this function, *SWAP is set to non-zero
410 value.
411 If there was a fatal error return -1; if the error could be corrected by
412 swapping operands of the parent node of this one, return 1; if everything is
413 ok return 0. */
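/* As an illustration, if the first stmt is  a0 >= b0 ? x0 : y0,  a stmt
   b1 <= a1 ? x1 : y1  is collected with SWAP == 1 (the comparison operands
   are read in swapped order) and a stmt  a1 < b1 ? y1 : x1  with SWAP == 2
   (the two arms are read in swapped order), in both cases yielding the
   operand order a, b, x, y of the first stmt.  */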
414 static int
415 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
416 bool *skip_args,
417 vec<stmt_vec_info> stmts, unsigned stmt_num,
418 vec<slp_oprnd_info> *oprnds_info)
419 {
420 stmt_vec_info stmt_info = stmts[stmt_num];
421 tree oprnd;
422 unsigned int i, number_of_oprnds;
423 enum vect_def_type dt = vect_uninitialized_def;
424 slp_oprnd_info oprnd_info;
425 int first_op_idx = 1;
426 unsigned int commutative_op = -1U;
427 bool first_op_cond = false;
428 bool first = stmt_num == 0;
429
430 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
431 {
432 number_of_oprnds = gimple_call_num_args (stmt);
433 first_op_idx = 3;
434 if (gimple_call_internal_p (stmt))
435 {
436 internal_fn ifn = gimple_call_internal_fn (stmt);
437 commutative_op = first_commutative_argument (ifn);
438
439 /* Masked load, only look at mask. */
440 if (ifn == IFN_MASK_LOAD)
441 {
442 number_of_oprnds = 1;
443 /* Mask operand index. */
444 first_op_idx = 5;
445 }
446 }
447 }
448 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
449 {
450 enum tree_code code = gimple_assign_rhs_code (stmt);
451 number_of_oprnds = gimple_num_ops (stmt) - 1;
452 /* Swap can only be done for cond_expr if asked to, otherwise we
453 could result in different comparison code to the first stmt. */
454 if (code == COND_EXPR
455 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
456 {
457 first_op_cond = true;
458 number_of_oprnds++;
459 }
460 else
461 commutative_op = commutative_tree_code (code) ? 0U : -1U;
462 }
463 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
464 number_of_oprnds = gimple_phi_num_args (stmt);
465 else
466 return -1;
467
468 bool swapped = (swap != 0);
469 bool backedge = false;
470 gcc_assert (!swapped || first_op_cond);
471 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
472 for (i = 0; i < number_of_oprnds; i++)
473 {
474 if (first_op_cond)
475 {
476 /* Map indicating how operands of cond_expr should be swapped. */
477 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
478 int *map = maps[swap];
479
480 if (i < 2)
481 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
482 first_op_idx), map[i]);
483 else
484 oprnd = gimple_op (stmt_info->stmt, map[i]);
485 }
486 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
487 {
488 oprnd = gimple_phi_arg_def (stmt, i);
489 backedge = dominated_by_p (CDI_DOMINATORS,
490 gimple_phi_arg_edge (stmt, i)->src,
491 gimple_bb (stmt_info->stmt));
492 }
493 else
494 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
495 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
496 oprnd = TREE_OPERAND (oprnd, 0);
497
498 oprnd_info = (*oprnds_info)[i];
499
500 stmt_vec_info def_stmt_info;
501 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
502 {
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "Build SLP failed: can't analyze def for %T\n",
506 oprnd);
507
508 return -1;
509 }
510
511 if (skip_args[i])
512 {
513 oprnd_info->def_stmts.quick_push (NULL);
514 oprnd_info->ops.quick_push (NULL_TREE);
515 oprnd_info->first_dt = vect_uninitialized_def;
516 continue;
517 }
518
519 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
520 oprnd_info->any_pattern = true;
521
522 oprnd_info->def_stmts.quick_push (def_stmt_info);
523 oprnd_info->ops.quick_push (oprnd);
524
525 /* If there's an extern def on a backedge make sure we can
526 code-generate at the region start.
527 ??? This is another case that could be fixed by adjusting
528 how we split the function but at the moment we'd have conflicting
529 goals there. */
530 if (backedge
531 && dts[i] == vect_external_def
532 && is_a <bb_vec_info> (vinfo)
533 && TREE_CODE (oprnd) == SSA_NAME
534 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
535 && !dominated_by_p (CDI_DOMINATORS,
536 as_a <bb_vec_info> (vinfo)->bbs[0],
537 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
538 {
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
541 "Build SLP failed: extern def %T only defined "
542 "on backedge\n", oprnd);
543 return -1;
544 }
545
546 if (first)
547 {
548 tree type = TREE_TYPE (oprnd);
549 dt = dts[i];
550 if ((dt == vect_constant_def
551 || dt == vect_external_def)
552 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
553 && (TREE_CODE (type) == BOOLEAN_TYPE
554 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
555 type)))
556 {
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
559 "Build SLP failed: invalid type of def "
560 "for variable-length SLP %T\n", oprnd);
561 return -1;
562 }
563
564 /* For the swapping logic below force vect_reduction_def
565 for the reduction op in a SLP reduction group. */
566 if (!STMT_VINFO_DATA_REF (stmt_info)
567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
568 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
569 && def_stmt_info)
570 dts[i] = dt = vect_reduction_def;
571
572 /* Check the types of the definition. */
573 switch (dt)
574 {
575 case vect_external_def:
576 case vect_constant_def:
577 case vect_internal_def:
578 case vect_reduction_def:
579 case vect_induction_def:
580 case vect_nested_cycle:
581 break;
582
583 default:
584 /* FORNOW: Not supported. */
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Build SLP failed: illegal type of def %T\n",
588 oprnd);
589 return -1;
590 }
591
592 oprnd_info->first_dt = dt;
593 oprnd_info->first_op_type = type;
594 }
595 }
596 if (first)
597 return 0;
598
599 /* Now match the operand definition types to that of the first stmt. */
600 for (i = 0; i < number_of_oprnds;)
601 {
602 if (skip_args[i])
603 {
604 ++i;
605 continue;
606 }
607
608 oprnd_info = (*oprnds_info)[i];
609 dt = dts[i];
610 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
611 oprnd = oprnd_info->ops[stmt_num];
612 tree type = TREE_TYPE (oprnd);
613
614 if (!types_compatible_p (oprnd_info->first_op_type, type))
615 {
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "Build SLP failed: different operand types\n");
619 return 1;
620 }
621
622 /* Not first stmt of the group, check that the def-stmt/s match
623 the def-stmt/s of the first stmt. Allow different definition
624 types for reduction chains: the first stmt must be a
625 vect_reduction_def (a phi node), and the rest
626 must be part of the reduction chain. */
627 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
628 && !(oprnd_info->first_dt == vect_reduction_def
629 && !STMT_VINFO_DATA_REF (stmt_info)
630 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
631 && def_stmt_info
632 && !STMT_VINFO_DATA_REF (def_stmt_info)
633 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
634 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
635 || (!STMT_VINFO_DATA_REF (stmt_info)
636 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
637 && ((!def_stmt_info
638 || STMT_VINFO_DATA_REF (def_stmt_info)
639 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
640 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
641 != (oprnd_info->first_dt != vect_reduction_def))))
642 {
643 /* Try swapping operands if we got a mismatch. For BB
644 vectorization only in case it will clearly improve things. */
645 if (i == commutative_op && !swapped
646 && (!is_a <bb_vec_info> (vinfo)
647 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
648 dts[i+1])
649 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
650 || vect_def_types_match
651 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
652 {
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "trying swapped operands\n");
656 std::swap (dts[i], dts[i+1]);
657 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
658 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
659 std::swap ((*oprnds_info)[i]->ops[stmt_num],
660 (*oprnds_info)[i+1]->ops[stmt_num]);
661 swapped = true;
662 continue;
663 }
664
665 if (is_a <bb_vec_info> (vinfo)
666 && !oprnd_info->any_pattern)
667 {
668 /* Now for commutative ops we should see whether we can
669 make the other operand match. */
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "treating operand as external\n");
673 oprnd_info->first_dt = dt = vect_external_def;
674 }
675 else
676 {
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
679 "Build SLP failed: different types\n");
680 return 1;
681 }
682 }
683
684 /* Make sure to demote the overall operand to external. */
685 if (dt == vect_external_def)
686 oprnd_info->first_dt = vect_external_def;
687 /* For a SLP reduction chain we want to duplicate the reduction to
688 each of the chain members. That gets us a sane SLP graph (still
689 the stmts are not 100% correct wrt the initial values). */
690 else if ((dt == vect_internal_def
691 || dt == vect_reduction_def)
692 && oprnd_info->first_dt == vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
695 && !STMT_VINFO_DATA_REF (def_stmt_info)
696 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
697 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
698 {
699 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
700 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
701 }
702
703 ++i;
704 }
705
706 /* Swap operands. */
707 if (swapped)
708 {
709 if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE, vect_location,
711 "swapped operands to match def types in %G",
712 stmt_info->stmt);
713 }
714
715 return 0;
716 }
717
718 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
719 Return true if we can, meaning that this choice doesn't conflict with
720 existing SLP nodes that use STMT_INFO. */
721
722 bool
723 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
724 {
725 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
726 if (old_vectype)
727 return useless_type_conversion_p (vectype, old_vectype);
728
729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
730 {
731 /* We maintain the invariant that if any statement in the group is
732 used, all other members of the group have the same vector type. */
733 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
734 stmt_vec_info member_info = first_info;
735 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
736 if (is_pattern_stmt_p (member_info)
737 && !useless_type_conversion_p (vectype,
738 STMT_VINFO_VECTYPE (member_info)))
739 break;
740
741 if (!member_info)
742 {
743 for (member_info = first_info; member_info;
744 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
745 STMT_VINFO_VECTYPE (member_info) = vectype;
746 return true;
747 }
748 }
749 else if (!is_pattern_stmt_p (stmt_info))
750 {
751 STMT_VINFO_VECTYPE (stmt_info) = vectype;
752 return true;
753 }
754
755 if (dump_enabled_p ())
756 {
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: incompatible vector"
759 " types for: %G", stmt_info->stmt);
760 dump_printf_loc (MSG_NOTE, vect_location,
761 " old vector type: %T\n", old_vectype);
762 dump_printf_loc (MSG_NOTE, vect_location,
763 " new vector type: %T\n", vectype);
764 }
765 return false;
766 }
767
768 /* Return true if call statements CALL1 and CALL2 are similar enough
769 to be combined into the same SLP group. */
770
771 static bool
772 compatible_calls_p (gcall *call1, gcall *call2)
773 {
774 unsigned int nargs = gimple_call_num_args (call1);
775 if (nargs != gimple_call_num_args (call2))
776 return false;
777
778 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
779 return false;
780
781 if (gimple_call_internal_p (call1))
782 {
783 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
784 TREE_TYPE (gimple_call_lhs (call2))))
785 return false;
786 for (unsigned int i = 0; i < nargs; ++i)
787 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
788 TREE_TYPE (gimple_call_arg (call2, i))))
789 return false;
790 }
791 else
792 {
793 if (!operand_equal_p (gimple_call_fn (call1),
794 gimple_call_fn (call2), 0))
795 return false;
796
797 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
798 return false;
799 }
800 return true;
801 }
802
803 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
804 caller's attempt to find the vector type in STMT_INFO with the narrowest
805 element type. Return true if VECTYPE is nonnull and if it is valid
806 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
807 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
808 vect_build_slp_tree. */
809
810 static bool
811 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
812 unsigned int group_size,
813 tree vectype, poly_uint64 *max_nunits)
814 {
815 if (!vectype)
816 {
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: unsupported data-type in %G\n",
820 stmt_info->stmt);
821 /* Fatal mismatch. */
822 return false;
823 }
824
825 /* If populating the vector type requires unrolling then fail
826 before adjusting *max_nunits for basic-block vectorization. */
827 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
828 unsigned HOST_WIDE_INT const_nunits;
829 if (is_a <bb_vec_info> (vinfo)
830 && (!nunits.is_constant (&const_nunits)
831 || const_nunits > group_size))
832 {
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: unrolling required "
836 "in basic block SLP\n");
837 /* Fatal mismatch. */
838 return false;
839 }
840
841 /* In case of multiple types we need to detect the smallest type. */
842 vect_update_max_nunits (max_nunits, vectype);
843 return true;
844 }
845
846 /* Verify that the scalar stmts STMTS are isomorphic, do not require
847 data permutation and do not use unsupported types of operation.
848 Return true if so, otherwise return false and indicate in *MATCHES
849 which stmts are not isomorphic to the first one.  If MATCHES[0]
850 is false then this indicates the comparison could not be
851 carried out or the stmts will never be vectorized by SLP.
852 
853 Note a COND_EXPR is possibly isomorphic to another one after swapping its
854 operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
855 the first stmt by swapping the two operands of the comparison; set SWAP[i]
856 to 2 if stmt I is isomorphic to the first stmt by inverting the code
857 of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
858 to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1.  */
859
860 static bool
861 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
862 vec<stmt_vec_info> stmts, unsigned int group_size,
863 poly_uint64 *max_nunits, bool *matches,
864 bool *two_operators, tree *node_vectype)
865 {
866 unsigned int i;
867 stmt_vec_info first_stmt_info = stmts[0];
868 enum tree_code first_stmt_code = ERROR_MARK;
869 enum tree_code alt_stmt_code = ERROR_MARK;
870 enum tree_code rhs_code = ERROR_MARK;
871 enum tree_code first_cond_code = ERROR_MARK;
872 tree lhs;
873 bool need_same_oprnds = false;
874 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
875 optab optab;
876 int icode;
877 machine_mode optab_op2_mode;
878 machine_mode vec_mode;
879 stmt_vec_info first_load = NULL, prev_first_load = NULL;
880 bool first_stmt_load_p = false, load_p = false;
881 bool first_stmt_phi_p = false, phi_p = false;
882
883 /* For every stmt in NODE find its def stmt/s. */
884 stmt_vec_info stmt_info;
885 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
886 {
887 gimple *stmt = stmt_info->stmt;
888 swap[i] = 0;
889 matches[i] = false;
890
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
893
894 /* Fail to vectorize statements marked as unvectorizable, statements
895 that can throw or statements with volatile operands. */
896 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
897 || stmt_can_throw_internal (cfun, stmt)
898 || gimple_has_volatile_ops (stmt))
899 {
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Build SLP failed: unvectorizable statement %G",
903 stmt);
904 /* ??? For BB vectorization we want to commute operands in a way
905 that shuffles all unvectorizable defs into one operand and keeps
906 the other still vectorizable.  The following doesn't reliably
907 work for that, but it's the easiest we can do here. */
908 if (is_a <bb_vec_info> (vinfo) && i != 0)
909 continue;
910 /* Fatal mismatch. */
911 matches[0] = false;
912 return false;
913 }
914
915 lhs = gimple_get_lhs (stmt);
916 if (lhs == NULL_TREE)
917 {
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
920 "Build SLP failed: not GIMPLE_ASSIGN nor "
921 "GIMPLE_CALL %G", stmt);
922 if (is_a <bb_vec_info> (vinfo) && i != 0)
923 continue;
924 /* Fatal mismatch. */
925 matches[0] = false;
926 return false;
927 }
928
929 tree nunits_vectype;
930 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
931 &nunits_vectype, group_size)
932 || (nunits_vectype
933 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
934 nunits_vectype, max_nunits)))
935 {
936 if (is_a <bb_vec_info> (vinfo) && i != 0)
937 continue;
938 /* Fatal mismatch. */
939 matches[0] = false;
940 return false;
941 }
942
943 gcc_assert (vectype);
944
945 gcall *call_stmt = dyn_cast <gcall *> (stmt);
946 if (call_stmt)
947 {
948 rhs_code = CALL_EXPR;
949
950 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
951 load_p = true;
952 else if ((gimple_call_internal_p (call_stmt)
953 && (!vectorizable_internal_fn_p
954 (gimple_call_internal_fn (call_stmt))))
955 || gimple_call_tail_p (call_stmt)
956 || gimple_call_noreturn_p (call_stmt)
957 || !gimple_call_nothrow_p (call_stmt)
958 || gimple_call_chain (call_stmt))
959 {
960 if (dump_enabled_p ())
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
962 "Build SLP failed: unsupported call type %G",
963 call_stmt);
964 if (is_a <bb_vec_info> (vinfo) && i != 0)
965 continue;
966 /* Fatal mismatch. */
967 matches[0] = false;
968 return false;
969 }
970 }
971 else if (gimple_code (stmt) == GIMPLE_PHI)
972 {
973 rhs_code = ERROR_MARK;
974 phi_p = true;
975 }
976 else
977 {
978 rhs_code = gimple_assign_rhs_code (stmt);
979 load_p = gimple_vuse (stmt);
980 }
981
982 /* Check the operation. */
983 if (i == 0)
984 {
985 *node_vectype = vectype;
986 first_stmt_code = rhs_code;
987 first_stmt_load_p = load_p;
988 first_stmt_phi_p = phi_p;
989
990 /* Shift arguments should be equal in all the packed stmts for a
991 vector shift with scalar shift operand. */
992 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
993 || rhs_code == LROTATE_EXPR
994 || rhs_code == RROTATE_EXPR)
995 {
996 vec_mode = TYPE_MODE (vectype);
997
998 /* First see if we have a vector/vector shift. */
999 optab = optab_for_tree_code (rhs_code, vectype,
1000 optab_vector);
1001
1002 if (!optab
1003 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1004 {
1005 /* No vector/vector shift, try for a vector/scalar shift. */
1006 optab = optab_for_tree_code (rhs_code, vectype,
1007 optab_scalar);
1008
1009 if (!optab)
1010 {
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 "Build SLP failed: no optab.\n");
1014 if (is_a <bb_vec_info> (vinfo) && i != 0)
1015 continue;
1016 /* Fatal mismatch. */
1017 matches[0] = false;
1018 return false;
1019 }
1020 icode = (int) optab_handler (optab, vec_mode);
1021 if (icode == CODE_FOR_nothing)
1022 {
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "Build SLP failed: "
1026 "op not supported by target.\n");
1027 if (is_a <bb_vec_info> (vinfo) && i != 0)
1028 continue;
1029 /* Fatal mismatch. */
1030 matches[0] = false;
1031 return false;
1032 }
1033 optab_op2_mode = insn_data[icode].operand[2].mode;
1034 if (!VECTOR_MODE_P (optab_op2_mode))
1035 {
1036 need_same_oprnds = true;
1037 first_op1 = gimple_assign_rhs2 (stmt);
1038 }
1039 }
1040 }
1041 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1042 {
1043 need_same_oprnds = true;
1044 first_op1 = gimple_assign_rhs2 (stmt);
1045 }
1046 else if (!load_p
1047 && rhs_code == BIT_FIELD_REF)
1048 {
1049 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1050 if (TREE_CODE (vec) != SSA_NAME
1051 || !types_compatible_p (vectype, TREE_TYPE (vec)))
1052 {
1053 if (is_a <bb_vec_info> (vinfo) && i != 0)
1054 continue;
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057 "Build SLP failed: "
1058 "BIT_FIELD_REF not supported\n");
1059 /* Fatal mismatch. */
1060 matches[0] = false;
1061 return false;
1062 }
1063 }
1064 else if (call_stmt
1065 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1066 {
1067 need_same_oprnds = true;
1068 first_op1 = gimple_call_arg (call_stmt, 1);
1069 }
1070 }
1071 else
1072 {
1073 if (first_stmt_code != rhs_code
1074 && alt_stmt_code == ERROR_MARK)
1075 alt_stmt_code = rhs_code;
1076 if ((first_stmt_code != rhs_code
1077 && (first_stmt_code != IMAGPART_EXPR
1078 || rhs_code != REALPART_EXPR)
1079 && (first_stmt_code != REALPART_EXPR
1080 || rhs_code != IMAGPART_EXPR)
1081 /* Handle mismatches in plus/minus by computing both
1082 and merging the results. */
1083 && !((first_stmt_code == PLUS_EXPR
1084 || first_stmt_code == MINUS_EXPR)
1085 && (alt_stmt_code == PLUS_EXPR
1086 || alt_stmt_code == MINUS_EXPR)
1087 && rhs_code == alt_stmt_code)
1088 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1089 && (first_stmt_code == ARRAY_REF
1090 || first_stmt_code == BIT_FIELD_REF
1091 || first_stmt_code == INDIRECT_REF
1092 || first_stmt_code == COMPONENT_REF
1093 || first_stmt_code == MEM_REF)))
1094 || first_stmt_load_p != load_p
1095 || first_stmt_phi_p != phi_p)
1096 {
1097 if (dump_enabled_p ())
1098 {
1099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1100 "Build SLP failed: different operation "
1101 "in stmt %G", stmt);
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "original stmt %G", first_stmt_info->stmt);
1104 }
1105 /* Mismatch. */
1106 continue;
1107 }
1108
1109 if (need_same_oprnds)
1110 {
1111 tree other_op1 = (call_stmt
1112 ? gimple_call_arg (call_stmt, 1)
1113 : gimple_assign_rhs2 (stmt));
1114 if (!operand_equal_p (first_op1, other_op1, 0))
1115 {
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1118 "Build SLP failed: different shift "
1119 "arguments in %G", stmt);
1120 /* Mismatch. */
1121 continue;
1122 }
1123 }
1124 if (!load_p
1125 && first_stmt_code == BIT_FIELD_REF
1126 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1127 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1128 {
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1131 "Build SLP failed: different BIT_FIELD_REF "
1132 "arguments in %G", stmt);
1133 /* Mismatch. */
1134 continue;
1135 }
1136
1137 if (!load_p && rhs_code == CALL_EXPR)
1138 {
1139 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1140 as_a <gcall *> (stmt)))
1141 {
1142 if (dump_enabled_p ())
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144 "Build SLP failed: different calls in %G",
1145 stmt);
1146 /* Mismatch. */
1147 continue;
1148 }
1149 }
1150
1151 if (phi_p
1152 && (gimple_bb (first_stmt_info->stmt)
1153 != gimple_bb (stmt_info->stmt)))
1154 {
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "Build SLP failed: different BB for PHI "
1158 "in %G", stmt);
1159 /* Mismatch. */
1160 continue;
1161 }
1162
1163 if (!types_compatible_p (vectype, *node_vectype))
1164 {
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "Build SLP failed: different vector type "
1168 "in %G", stmt);
1169 /* Mismatch. */
1170 continue;
1171 }
1172 }
1173
1174 /* Grouped store or load. */
1175 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1176 {
1177 if (REFERENCE_CLASS_P (lhs))
1178 {
1179 /* Store. */
1180 ;
1181 }
1182 else
1183 {
1184 /* Load. */
1185 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1186 if (prev_first_load)
1187 {
1188 /* Check that there are no loads from different interleaving
1189 chains in the same node. */
1190 if (prev_first_load != first_load)
1191 {
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1194 vect_location,
1195 "Build SLP failed: different "
1196 "interleaving chains in one node %G",
1197 stmt);
1198 /* Mismatch. */
1199 continue;
1200 }
1201 }
1202 else
1203 prev_first_load = first_load;
1204 }
1205 } /* Grouped access. */
1206 else
1207 {
1208 if (load_p)
1209 {
1210 /* Not grouped load. */
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: not grouped load %G", stmt);
1214
1215 /* FORNOW: Non-grouped loads are not supported. */
1216 if (is_a <bb_vec_info> (vinfo) && i != 0)
1217 continue;
1218 /* Fatal mismatch. */
1219 matches[0] = false;
1220 return false;
1221 }
1222
1223 /* Not a memory operation. */
1224 if (!phi_p
1225 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1226 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1227 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1228 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1229 && rhs_code != VIEW_CONVERT_EXPR
1230 && rhs_code != CALL_EXPR
1231 && rhs_code != BIT_FIELD_REF)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "Build SLP failed: operation unsupported %G",
1236 stmt);
1237 if (is_a <bb_vec_info> (vinfo) && i != 0)
1238 continue;
1239 /* Fatal mismatch. */
1240 matches[0] = false;
1241 return false;
1242 }
1243
1244 if (rhs_code == COND_EXPR)
1245 {
1246 tree cond_expr = gimple_assign_rhs1 (stmt);
1247 enum tree_code cond_code = TREE_CODE (cond_expr);
1248 enum tree_code swap_code = ERROR_MARK;
1249 enum tree_code invert_code = ERROR_MARK;
1250
1251 if (i == 0)
1252 first_cond_code = TREE_CODE (cond_expr);
1253 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1254 {
1255 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1256 swap_code = swap_tree_comparison (cond_code);
1257 invert_code = invert_tree_comparison (cond_code, honor_nans);
1258 }
1259
1260 if (first_cond_code == cond_code)
1261 ;
1262 /* Isomorphic can be achieved by swapping. */
1263 else if (first_cond_code == swap_code)
1264 swap[i] = 1;
1265 /* Isomorphic can be achieved by inverting. */
1266 else if (first_cond_code == invert_code)
1267 swap[i] = 2;
1268 else
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "Build SLP failed: different"
1273 " operation %G", stmt);
1274 /* Mismatch. */
1275 continue;
1276 }
1277 }
1278 }
1279
1280 matches[i] = true;
1281 }
1282
1283 for (i = 0; i < group_size; ++i)
1284 if (!matches[i])
1285 return false;
1286
1287 /* If we allowed a two-operation SLP node, record that fact so the
1288 caller can build the blending permutation for it. */
1289 if (alt_stmt_code != ERROR_MARK
1290 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1291 {
1292 *two_operators = true;
1293 }
1294
1295 return true;
1296 }
1297
1298 /* Traits for the hash_map used to record successful or failed SLP builds
1299 for a stmt set.  Note we never remove entries apart from at destruction
1300 time so we do not need a special value for deleted that differs from empty. */
1301 struct bst_traits
1302 {
1303 typedef vec <stmt_vec_info> value_type;
1304 typedef vec <stmt_vec_info> compare_type;
1305 static inline hashval_t hash (value_type);
1306 static inline bool equal (value_type existing, value_type candidate);
1307 static inline bool is_empty (value_type x) { return !x.exists (); }
1308 static inline bool is_deleted (value_type x) { return !x.exists (); }
1309 static const bool empty_zero_p = true;
1310 static inline void mark_empty (value_type &x) { x.release (); }
1311 static inline void mark_deleted (value_type &x) { x.release (); }
1312 static inline void remove (value_type &x) { x.release (); }
1313 };
1314 inline hashval_t
1315 bst_traits::hash (value_type x)
1316 {
1317 inchash::hash h;
1318 for (unsigned i = 0; i < x.length (); ++i)
1319 h.add_int (gimple_uid (x[i]->stmt));
1320 return h.end ();
1321 }
1322 inline bool
1323 bst_traits::equal (value_type existing, value_type candidate)
1324 {
1325 if (existing.length () != candidate.length ())
1326 return false;
1327 for (unsigned i = 0; i < existing.length (); ++i)
1328 if (existing[i] != candidate[i])
1329 return false;
1330 return true;
1331 }
1332
1333 typedef hash_map <vec <gimple *>, slp_tree,
1334 simple_hashmap_traits <bst_traits, slp_tree> >
1335 scalar_stmts_to_slp_tree_map_t;
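/* The map keys on the exact sequence of scalar stmts.  A successful build
   maps to the SLP node (which keeps one reference on behalf of the map),
   a failed build maps to NULL so the same stmt set is not analyzed again.  */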
1336
1337 static slp_tree
1338 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1339 vec<stmt_vec_info> stmts, unsigned int group_size,
1340 poly_uint64 *max_nunits,
1341 bool *matches, unsigned *npermutes, unsigned *tree_size,
1342 scalar_stmts_to_slp_tree_map_t *bst_map);
1343
1344 static slp_tree
1345 vect_build_slp_tree (vec_info *vinfo,
1346 vec<stmt_vec_info> stmts, unsigned int group_size,
1347 poly_uint64 *max_nunits,
1348 bool *matches, unsigned *npermutes, unsigned *tree_size,
1349 scalar_stmts_to_slp_tree_map_t *bst_map)
1350 {
1351 if (slp_tree *leader = bst_map->get (stmts))
1352 {
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1355 *leader ? "" : "failed ", *leader);
1356 if (*leader)
1357 {
1358 SLP_TREE_REF_COUNT (*leader)++;
1359 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1360 }
1361 return *leader;
1362 }
1363
1364 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1365 so we can pick up backedge destinations during discovery. */
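/* In particular, when discovery recurses into a cycle (for example a
   reduction PHI whose latch value is computed by stmts of this same set),
   the lookup above finds this stub and re-uses it instead of recursing
   forever.  */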
1366 slp_tree res = new _slp_tree;
1367 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1368 SLP_TREE_SCALAR_STMTS (res) = stmts;
1369 bst_map->put (stmts.copy (), res);
1370
1371 poly_uint64 this_max_nunits = 1;
1372 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1373 &this_max_nunits,
1374 matches, npermutes, tree_size, bst_map);
1375 if (!res_)
1376 {
1377 bool existed_p = bst_map->put (stmts, NULL);
1378 gcc_assert (existed_p);
1379 /* Mark the node invalid so we can detect those when still in use
1380 as backedge destinations. */
1381 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1382 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1383 vect_free_slp_tree (res);
1384 }
1385 else
1386 {
1387 gcc_assert (res_ == res);
1388 res->max_nunits = this_max_nunits;
1389 vect_update_max_nunits (max_nunits, this_max_nunits);
1390 /* Keep a reference for the bst_map use. */
1391 SLP_TREE_REF_COUNT (res)++;
1392 }
1393 return res_;
1394 }
1395
1396 /* Recursively build an SLP tree for the group of stmts STMTS, filling
1397 in the pre-allocated NODE.  Return NULL (and indicate in *MATCHES which
1398 stmts do not match the first one) if the def-stmts are not isomorphic,
1399 require data permutation or use unsupported operations.  Otherwise
1400 return NODE with its children, vector type and lane information
1401 filled in. */
1402
1403 static slp_tree
1404 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1405 vec<stmt_vec_info> stmts, unsigned int group_size,
1406 poly_uint64 *max_nunits,
1407 bool *matches, unsigned *npermutes, unsigned *tree_size,
1408 scalar_stmts_to_slp_tree_map_t *bst_map)
1409 {
1410 unsigned nops, i, this_tree_size = 0;
1411 poly_uint64 this_max_nunits = *max_nunits;
1412
1413 matches[0] = false;
1414
1415 stmt_vec_info stmt_info = stmts[0];
1416 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1417 nops = gimple_call_num_args (stmt);
1418 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1419 {
1420 nops = gimple_num_ops (stmt) - 1;
1421 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1422 nops++;
1423 }
1424 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1425 nops = gimple_phi_num_args (phi);
1426 else
1427 return NULL;
1428
1429 /* If the SLP node is a PHI (induction or reduction), terminate
1430 the recursion. */
1431 bool *skip_args = XALLOCAVEC (bool, nops);
1432 memset (skip_args, 0, nops);
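/* A set skip_args[I] means no SLP child is built for PHI argument I and a
   NULL child is pushed instead; this is used below for the latch value of
   inductions, reduction initial values on the preheader edge and the
   latch definition of a SLP reduction chain, which are all represented or
   filled in separately.  */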
1433 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1434 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1435 {
1436 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1437 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1438 group_size);
1439 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1440 max_nunits))
1441 return NULL;
1442
1443 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1444 if (def_type == vect_induction_def)
1445 {
1446 /* Induction PHIs are not treated as cycles: skip their latch
1447 value and only walk the initial value. */
1448 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1449 if (nested_in_vect_loop_p (loop, stmt_info))
1450 loop = loop->inner;
1451 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1452 }
1453 else if (def_type == vect_reduction_def
1454 || def_type == vect_double_reduction_def
1455 || def_type == vect_nested_cycle)
1456 {
1457 /* Else def types have to match. */
1458 stmt_vec_info other_info;
1459 bool all_same = true;
1460 FOR_EACH_VEC_ELT (stmts, i, other_info)
1461 {
1462 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1463 return NULL;
1464 if (other_info != stmt_info)
1465 all_same = false;
1466 }
1467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1468 /* Reduction initial values are not explicitly represented. */
1469 if (!nested_in_vect_loop_p (loop, stmt_info))
1470 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1471 /* Reduction chain backedge defs are filled manually.
1472 ??? Need a better way to identify a SLP reduction chain PHI.
1473 Or a better overall way to SLP match those. */
1474 if (all_same && def_type == vect_reduction_def)
1475 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1476 }
1477 else if (def_type != vect_internal_def)
1478 return NULL;
1479 }
1480
1481
1482 bool two_operators = false;
1483 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1484 tree vectype = NULL_TREE;
1485 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1486 &this_max_nunits, matches, &two_operators,
1487 &vectype))
1488 return NULL;
1489
1490 /* If the SLP node is a load, terminate the recursion unless masked. */
1491 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1492 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1493 {
1494 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1495 {
1496 /* Masked load. */
1497 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1498 nops = 1;
1499 }
1500 else
1501 {
1502 *max_nunits = this_max_nunits;
1503 (*tree_size)++;
1504 node = vect_create_new_slp_node (node, stmts, 0);
1505 SLP_TREE_VECTYPE (node) = vectype;
1506 /* And compute the load permutation. Whether it is actually
1507 a permutation depends on the unrolling factor which is
1508 decided later. */
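/* For instance, if the group rooted at FIRST_STMT_INFO loads
   a[0] ... a[3] while the scalar stmts of this node load a[2], a[3],
   a[0], a[1], the recorded load permutation is { 2, 3, 0, 1 }.  */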
1509 vec<unsigned> load_permutation;
1510 int j;
1511 stmt_vec_info load_info;
1512 load_permutation.create (group_size);
1513 stmt_vec_info first_stmt_info
1514 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1515 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1516 {
1517 int load_place = vect_get_place_in_interleaving_chain
1518 (load_info, first_stmt_info);
1519 gcc_assert (load_place != -1);
1520 load_permutation.safe_push (load_place);
1521 }
1522 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1523 return node;
1524 }
1525 }
1526 else if (gimple_assign_single_p (stmt_info->stmt)
1527 && !gimple_vuse (stmt_info->stmt)
1528 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1529 {
1530 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1531 the same SSA name vector of a type compatible with vectype. */
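/* E.g. a stmt  _1 = BIT_FIELD_REF <v_2, 32, 64>  on a vector of 32-bit
   elements extracts lane 2, so the lane permutation entry pushed below is
   (0, 2), selecting lane 2 of the single vector child created for v_2.  */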
1532 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1533 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1534 stmt_vec_info estmt_info;
1535 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1536 {
1537 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1538 tree bfref = gimple_assign_rhs1 (estmt);
1539 HOST_WIDE_INT lane;
1540 if (!known_eq (bit_field_size (bfref),
1541 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1542 || !constant_multiple_p (bit_field_offset (bfref),
1543 bit_field_size (bfref), &lane))
1544 {
1545 lperm.release ();
1546 return NULL;
1547 }
1548 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1549 }
1550 slp_tree vnode = vect_create_new_slp_node (vNULL);
1551 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1552 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1553 /* We are always building a permutation node even if it is an identity
1554 permute to shield the rest of the vectorizer from the odd node
1555 representing an actual vector without any scalar ops.
1556 ??? We could hide it completely by making the permute node
1557 external? */
1558 node = vect_create_new_slp_node (node, stmts, 1);
1559 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1560 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1561 SLP_TREE_VECTYPE (node) = vectype;
1562 SLP_TREE_CHILDREN (node).quick_push (vnode);
1563 return node;
1564 }
1565
1566 /* Get at the operands, verifying they are compatible. */
1567 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1568 slp_oprnd_info oprnd_info;
1569 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1570 {
1571 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1572 stmts, i, &oprnds_info);
1573 if (res != 0)
1574 matches[(res == -1) ? 0 : i] = false;
1575 if (!matches[0])
1576 break;
1577 }
1578 for (i = 0; i < group_size; ++i)
1579 if (!matches[i])
1580 {
1581 vect_free_oprnd_info (oprnds_info);
1582 return NULL;
1583 }
1584 swap = NULL;
1585
1586 auto_vec<slp_tree, 4> children;
1587
1588 stmt_info = stmts[0];
1589
1590 /* Create SLP_TREE nodes for the definition node/s. */
1591 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1592 {
1593 slp_tree child;
1594 unsigned int j;
1595
1596 /* We're skipping certain operands from processing, for example
1597 outer loop reduction initial defs. */
1598 if (skip_args[i])
1599 {
1600 children.safe_push (NULL);
1601 continue;
1602 }
1603
1604 if (oprnd_info->first_dt == vect_uninitialized_def)
1605 {
1606 /* COND_EXPRs have one operand slot too many here when the condition
1607 is an SSA name rather than a comparison. */
1608 gcc_assert (i == 3 && nops == 4);
1609 continue;
1610 }
1611
1612 if (is_a <bb_vec_info> (vinfo)
1613 && oprnd_info->first_dt == vect_internal_def
1614 && !oprnd_info->any_pattern)
1615 {
1616 /* For BB vectorization, if all defs are the same do not
1617 bother to continue the build along the single-lane
1618 graph but use a splat of the scalar value. */
1619 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1620 for (j = 1; j < group_size; ++j)
1621 if (oprnd_info->def_stmts[j] != first_def)
1622 break;
1623 if (j == group_size
1624 /* But avoid doing this for loads where we may be
1625 able to CSE things, unless the stmt is not
1626 vectorizable. */
1627 && (!STMT_VINFO_VECTORIZABLE (first_def)
1628 || !gimple_vuse (first_def->stmt)))
1629 {
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_NOTE, vect_location,
1632 "Using a splat of the uniform operand\n");
1633 oprnd_info->first_dt = vect_external_def;
1634 }
1635 }
1636
1637 if (oprnd_info->first_dt == vect_external_def
1638 || oprnd_info->first_dt == vect_constant_def)
1639 {
1640 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1641 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1642 oprnd_info->ops = vNULL;
1643 children.safe_push (invnode);
1644 continue;
1645 }
1646
1647 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1648 group_size, &this_max_nunits,
1649 matches, npermutes,
1650 &this_tree_size, bst_map)) != NULL)
1651 {
1652 oprnd_info->def_stmts = vNULL;
1653 children.safe_push (child);
1654 continue;
1655 }
1656
1657 /* If the SLP build for operand zero failed and operands zero
1658 and one can be commuted, try that for the scalar stmts
1659 that failed the match. */
1660 if (i == 0
1661 /* A first scalar stmt mismatch signals a fatal mismatch. */
1662 && matches[0]
1663 /* ??? For COND_EXPRs we can swap the comparison operands
1664 as well as the arms under some constraints. */
1665 && nops == 2
1666 && oprnds_info[1]->first_dt == vect_internal_def
1667 && is_gimple_assign (stmt_info->stmt)
1668 /* Swapping operands for reductions breaks assumptions later on. */
1669 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1670 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1671 /* Do so only if the number of unsuccessful permutes was no more
1672 than a cut-off, as re-trying the recursive match on
1673 possibly each level of the tree would expose exponential
1674 behavior. */
1675 && *npermutes < 4)
1676 {
1677 /* See whether we can swap the matching or the non-matching
1678 stmt operands. */
1679 bool swap_not_matching = true;
1680 do
1681 {
1682 for (j = 0; j < group_size; ++j)
1683 {
1684 if (matches[j] != !swap_not_matching)
1685 continue;
1686 stmt_vec_info stmt_info = stmts[j];
1687 /* Verify if we can swap operands of this stmt. */
1688 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1689 if (!stmt
1690 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1691 {
1692 if (!swap_not_matching)
1693 goto fail;
1694 swap_not_matching = false;
1695 break;
1696 }
1697 }
1698 }
1699 while (j != group_size);
1700
1701 /* Swap mismatched definition stmts. */
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_NOTE, vect_location,
1704 "Re-trying with swapped operands of stmts ");
1705 for (j = 0; j < group_size; ++j)
1706 if (matches[j] == !swap_not_matching)
1707 {
1708 std::swap (oprnds_info[0]->def_stmts[j],
1709 oprnds_info[1]->def_stmts[j]);
1710 std::swap (oprnds_info[0]->ops[j],
1711 oprnds_info[1]->ops[j]);
1712 if (dump_enabled_p ())
1713 dump_printf (MSG_NOTE, "%d ", j);
1714 }
1715 if (dump_enabled_p ())
1716 dump_printf (MSG_NOTE, "\n");
1717 /* And try again with scratch 'matches' ... */
1718 bool *tem = XALLOCAVEC (bool, group_size);
1719 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1720 group_size, &this_max_nunits,
1721 tem, npermutes,
1722 &this_tree_size, bst_map)) != NULL)
1723 {
1724 oprnd_info->def_stmts = vNULL;
1725 children.safe_push (child);
1726 continue;
1727 }
1728 /* We do not undo the swapping here since it might still be
1729 the better order for the second operand in case we build
1730 the first one from scalars below. */
1731 ++*npermutes;
1732 }
1733 fail:
1734
1735 /* If the SLP build failed and we analyze a basic-block
1736 simply treat nodes we fail to build as externally defined
1737 (and thus build vectors from the scalar defs).
1738 The cost model will reject outright expensive cases.
1739 ??? This doesn't treat cases where permutation ultimately
1740 fails (or we don't try permutation below). Ideally we'd
1741 even compute a permutation that will end up with the maximum
1742 SLP tree size... */
1743 if (is_a <bb_vec_info> (vinfo)
1744 /* ??? Rejecting patterns this way doesn't work. We'd have to
1745 do extra work to cancel the pattern so the uses see the
1746 scalar version. */
1747 && !is_pattern_stmt_p (stmt_info)
1748 && !oprnd_info->any_pattern)
1749 {
1750 /* But if there's a leading vector sized set of matching stmts
1751 fail here so we can split the group. This matches the condition
1752 vect_analyze_slp_instance uses. */
1753 /* ??? We might want to split here and combine the results to support
1754 multiple vector sizes better. */
1755 for (j = 0; j < group_size; ++j)
1756 if (!matches[j])
1757 break;
1758 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1759 {
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_NOTE, vect_location,
1762 "Building vector operands from scalars\n");
1763 this_tree_size++;
1764 child = vect_create_new_slp_node (oprnd_info->ops);
1765 children.safe_push (child);
1766 oprnd_info->ops = vNULL;
1767 continue;
1768 }
1769 }
1770
1771 gcc_assert (child == NULL);
1772 FOR_EACH_VEC_ELT (children, j, child)
1773 if (child)
1774 vect_free_slp_tree (child);
1775 vect_free_oprnd_info (oprnds_info);
1776 return NULL;
1777 }
1778
1779 vect_free_oprnd_info (oprnds_info);
1780
1781 /* If all children of this node are built up from uniform scalars, or if
1782 building it requires more than one possibly expensive vector construction,
1783 throw the node away so it is built up from scalars instead.
1784 The exception is the SLP node for the vector store. */
1785 if (is_a <bb_vec_info> (vinfo)
1786 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1787 /* ??? Rejecting patterns this way doesn't work. We'd have to
1788 do extra work to cancel the pattern so the uses see the
1789 scalar version. */
1790 && !is_pattern_stmt_p (stmt_info))
1791 {
1792 slp_tree child;
1793 unsigned j;
1794 bool all_uniform_p = true;
1795 unsigned n_vector_builds = 0;
1796 FOR_EACH_VEC_ELT (children, j, child)
1797 {
1798 if (!child)
1799 ;
1800 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1801 all_uniform_p = false;
1802 else if (!vect_slp_tree_uniform_p (child))
1803 {
1804 all_uniform_p = false;
1805 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1806 n_vector_builds++;
1807 }
1808 }
1809 if (all_uniform_p || n_vector_builds > 1)
1810 {
1811 /* Roll back. */
1812 matches[0] = false;
1813 FOR_EACH_VEC_ELT (children, j, child)
1814 if (child)
1815 vect_free_slp_tree (child);
1816
1817 if (dump_enabled_p ())
1818 dump_printf_loc (MSG_NOTE, vect_location,
1819 "Building parent vector operands from "
1820 "scalars instead\n");
1821 return NULL;
1822 }
1823 }
1824
1825 *tree_size += this_tree_size + 1;
1826 *max_nunits = this_max_nunits;
1827
1828 if (two_operators)
1829 {
1830 /* ??? We'd likely want to either cache in bst_map something like
1831 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1832 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1833 explicit stmts to put in so the keying on 'stmts' doesn't
1834 work (but we have the same issue with nodes that use 'ops'). */
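/* Illustrative example (not from the source): for the two-operator group
   { a+b, a-b, a+b, a-b } the node ONE built below gets PLUS_EXPR, node TWO
   gets MINUS_EXPR, and the parent VEC_PERM node records the lane permutation
   { 0[0], 1[1], 0[2], 1[3] }, picking alternate lanes from the two children.  */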
1835 slp_tree one = new _slp_tree;
1836 slp_tree two = new _slp_tree;
1837 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1838 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1839 SLP_TREE_VECTYPE (one) = vectype;
1840 SLP_TREE_VECTYPE (two) = vectype;
1841 SLP_TREE_CHILDREN (one).safe_splice (children);
1842 SLP_TREE_CHILDREN (two).safe_splice (children);
1843 slp_tree child;
1844 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1845 SLP_TREE_REF_COUNT (child)++;
1846
1847 /* Here we record the original defs since this
1848 node represents the final lane configuration. */
1849 node = vect_create_new_slp_node (node, stmts, 2);
1850 SLP_TREE_VECTYPE (node) = vectype;
1851 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1852 SLP_TREE_CHILDREN (node).quick_push (one);
1853 SLP_TREE_CHILDREN (node).quick_push (two);
1854 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1855 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1856 enum tree_code ocode = ERROR_MARK;
1857 stmt_vec_info ostmt_info;
1858 unsigned j = 0;
1859 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1860 {
1861 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1862 if (gimple_assign_rhs_code (ostmt) != code0)
1863 {
1864 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1865 ocode = gimple_assign_rhs_code (ostmt);
1866 j = i;
1867 }
1868 else
1869 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1870 }
1871 SLP_TREE_CODE (one) = code0;
1872 SLP_TREE_CODE (two) = ocode;
1873 SLP_TREE_LANES (one) = stmts.length ();
1874 SLP_TREE_LANES (two) = stmts.length ();
1875 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1876 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1877 return node;
1878 }
1879
1880 node = vect_create_new_slp_node (node, stmts, nops);
1881 SLP_TREE_VECTYPE (node) = vectype;
1882 SLP_TREE_CHILDREN (node).splice (children);
1883 return node;
1884 }
1885
1886 /* Dump a single SLP tree NODE. */
1887
1888 static void
1889 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1890 slp_tree node)
1891 {
1892 unsigned i, j;
1893 slp_tree child;
1894 stmt_vec_info stmt_info;
1895 tree op;
1896
1897 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1898 dump_user_location_t user_loc = loc.get_user_location ();
1899 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1900 SLP_TREE_DEF_TYPE (node) == vect_external_def
1901 ? " (external)"
1902 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1903 ? " (constant)"
1904 : ""), node,
1905 estimated_poly_value (node->max_nunits),
1906 SLP_TREE_REF_COUNT (node));
1907 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1908 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1909 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1910 else
1911 {
1912 dump_printf_loc (metadata, user_loc, "\t{ ");
1913 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1914 dump_printf (metadata, "%T%s ", op,
1915 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1916 dump_printf (metadata, "}\n");
1917 }
1918 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1919 {
1920 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1921 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1922 dump_printf (dump_kind, " %u", j);
1923 dump_printf (dump_kind, " }\n");
1924 }
1925 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1926 {
1927 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1928 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1929 dump_printf (dump_kind, " %u[%u]",
1930 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1931 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1932 dump_printf (dump_kind, " }\n");
1933 }
1934 if (SLP_TREE_CHILDREN (node).is_empty ())
1935 return;
1936 dump_printf_loc (metadata, user_loc, "\tchildren");
1937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1938 dump_printf (dump_kind, " %p", (void *)child);
1939 dump_printf (dump_kind, "\n");
1940 }
1941
1942 DEBUG_FUNCTION void
1943 debug (slp_tree node)
1944 {
1945 debug_dump_context ctx;
1946 vect_print_slp_tree (MSG_NOTE,
1947 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1948 node);
1949 }
1950
1951 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1952
1953 static void
1954 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1955 slp_tree node, hash_set<slp_tree> &visited)
1956 {
1957 unsigned i;
1958 slp_tree child;
1959
1960 if (visited.add (node))
1961 return;
1962
1963 vect_print_slp_tree (dump_kind, loc, node);
1964
1965 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1966 if (child)
1967 vect_print_slp_graph (dump_kind, loc, child, visited);
1968 }
1969
1970 static void
1971 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1972 slp_tree entry)
1973 {
1974 hash_set<slp_tree> visited;
1975 vect_print_slp_graph (dump_kind, loc, entry, visited);
1976 }
1977
1978 /* Mark the tree rooted at NODE with PURE_SLP. */
1979
1980 static void
1981 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1982 {
1983 int i;
1984 stmt_vec_info stmt_info;
1985 slp_tree child;
1986
1987 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1988 return;
1989
1990 if (visited.add (node))
1991 return;
1992
1993 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1994 STMT_SLP_TYPE (stmt_info) = pure_slp;
1995
1996 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1997 if (child)
1998 vect_mark_slp_stmts (child, visited);
1999 }
2000
2001 static void
2002 vect_mark_slp_stmts (slp_tree node)
2003 {
2004 hash_set<slp_tree> visited;
2005 vect_mark_slp_stmts (node, visited);
2006 }
2007
2008 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2009
2010 static void
2011 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2012 {
2013 int i;
2014 stmt_vec_info stmt_info;
2015 slp_tree child;
2016
2017 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2018 return;
2019
2020 if (visited.add (node))
2021 return;
2022
2023 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2024 {
2025 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2026 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2027 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2028 }
2029
2030 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2031 if (child)
2032 vect_mark_slp_stmts_relevant (child, visited);
2033 }
2034
2035 static void
2036 vect_mark_slp_stmts_relevant (slp_tree node)
2037 {
2038 hash_set<slp_tree> visited;
2039 vect_mark_slp_stmts_relevant (node, visited);
2040 }
2041
2042
2043 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2044
2045 static void
2046 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2047 hash_set<slp_tree> &visited)
2048 {
2049 if (!node || visited.add (node))
2050 return;
2051
2052 if (SLP_TREE_CHILDREN (node).length () == 0)
2053 {
2054 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2055 return;
2056 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2057 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2058 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2059 loads.safe_push (node);
2060 }
2061 else
2062 {
2063 unsigned i;
2064 slp_tree child;
2065 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2066 vect_gather_slp_loads (loads, child, visited);
2067 }
2068 }
2069
2070 static void
2071 vect_gather_slp_loads (slp_instance inst, slp_tree node)
2072 {
2073 hash_set<slp_tree> visited;
2074 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2075 }
2076
2077
2078 /* Find the last scalar stmt in NODE. */
2079
2080 stmt_vec_info
2081 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2082 {
2083 stmt_vec_info last = NULL;
2084 stmt_vec_info stmt_vinfo;
2085
2086 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2087 {
2088 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2089 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2090 }
2091
2092 return last;
2093 }
2094
2095 /* Find the first scalar stmt in NODE. */
2096
2097 stmt_vec_info
2098 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2099 {
2100 stmt_vec_info first = NULL;
2101 stmt_vec_info stmt_vinfo;
2102
2103 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2104 {
2105 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2106 if (!first
2107 || get_later_stmt (stmt_vinfo, first) == first)
2108 first = stmt_vinfo;
2109 }
2110
2111 return first;
2112 }
2113
2114 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2115 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2116 (also containing the first GROUP1_SIZE stmts, since stores are
2117 consecutive), the second containing the remainder.
2118 Return the first stmt in the second group. */
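/* For example (illustrative sizes): splitting a group of 7 stores with
   GROUP1_SIZE = 4 leaves stmts 0..3 in the first group and returns stmt 4
   as the head of a new group of size 3; the DR_GROUP_GAPs of both groups
   are then adjusted so each one also skips over the other's elements.  */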
2119
2120 static stmt_vec_info
2121 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2122 {
2123 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2124 gcc_assert (group1_size > 0);
2125 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2126 gcc_assert (group2_size > 0);
2127 DR_GROUP_SIZE (first_vinfo) = group1_size;
2128
2129 stmt_vec_info stmt_info = first_vinfo;
2130 for (unsigned i = group1_size; i > 1; i--)
2131 {
2132 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2133 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2134 }
2135 /* STMT is now the last element of the first group. */
2136 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2137 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2138
2139 DR_GROUP_SIZE (group2) = group2_size;
2140 for (stmt_info = group2; stmt_info;
2141 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2142 {
2143 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2144 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2145 }
2146
2147 /* For the second group, the DR_GROUP_GAP is that before the original group,
2148 plus skipping over the first vector. */
2149 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2150
2151 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2152 DR_GROUP_GAP (first_vinfo) += group2_size;
2153
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2156 group1_size, group2_size);
2157
2158 return group2;
2159 }
2160
2161 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2162 statements and a vector of NUNITS elements. */
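/* For example (illustrative values): NUNITS = 4 and GROUP_SIZE = 6 give
   common_multiple (4, 6) = 12 and thus an unrolling factor of 12 / 6 = 2.  */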
2163
2164 static poly_uint64
2165 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2166 {
2167 return exact_div (common_multiple (nunits, group_size), group_size);
2168 }
2169
2170 enum slp_instance_kind {
2171 slp_inst_kind_store,
2172 slp_inst_kind_reduc_group,
2173 slp_inst_kind_reduc_chain,
2174 slp_inst_kind_ctor
2175 };
2176
2177 static bool
2178 vect_analyze_slp_instance (vec_info *vinfo,
2179 scalar_stmts_to_slp_tree_map_t *bst_map,
2180 stmt_vec_info stmt_info, unsigned max_tree_size);
2181
2182 /* Build an SLP instance of kind KIND starting from the group of stmts
2183 SCALAR_STMTS. Return true if successful. */
2184
2185 static bool
2186 vect_build_slp_instance (vec_info *vinfo,
2187 slp_instance_kind kind,
2188 vec<stmt_vec_info> scalar_stmts,
2189 stmt_vec_info root_stmt_info,
2190 unsigned max_tree_size,
2191 scalar_stmts_to_slp_tree_map_t *bst_map,
2192 /* ??? We need stmt_info for group splitting. */
2193 stmt_vec_info stmt_info_)
2194 {
2195 if (dump_enabled_p ())
2196 {
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "Starting SLP discovery for\n");
2199 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 " %G", scalar_stmts[i]->stmt);
2202 }
2203
2204 /* Build the tree for the SLP instance. */
2205 unsigned int group_size = scalar_stmts.length ();
2206 bool *matches = XALLOCAVEC (bool, group_size);
2207 unsigned npermutes = 0;
2208 poly_uint64 max_nunits = 1;
2209 unsigned tree_size = 0;
2210 unsigned i;
2211 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2212 &max_nunits, matches, &npermutes,
2213 &tree_size, bst_map);
2214 if (node != NULL)
2215 {
2216 /* Calculate the unrolling factor based on the smallest type. */
2217 poly_uint64 unrolling_factor
2218 = calculate_unrolling_factor (max_nunits, group_size);
2219
2220 if (maybe_ne (unrolling_factor, 1U)
2221 && is_a <bb_vec_info> (vinfo))
2222 {
2223 unsigned HOST_WIDE_INT const_max_nunits;
2224 if (!max_nunits.is_constant (&const_max_nunits)
2225 || const_max_nunits > group_size)
2226 {
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229 "Build SLP failed: store group "
2230 "size not a multiple of the vector size "
2231 "in basic block SLP\n");
2232 vect_free_slp_tree (node);
2233 return false;
2234 }
2235 /* Fatal mismatch. */
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_NOTE, vect_location,
2238 "SLP discovery succeeded but node needs "
2239 "splitting\n");
2240 memset (matches, true, group_size);
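/* Force a mismatch at the first stmt after the last multiple of the vector
   size so the group-splitting code further below will split the group there.  */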
2241 matches[group_size / const_max_nunits * const_max_nunits] = false;
2242 vect_free_slp_tree (node);
2243 }
2244 else
2245 {
2246 /* Create a new SLP instance. */
2247 slp_instance new_instance = XNEW (class _slp_instance);
2248 SLP_INSTANCE_TREE (new_instance) = node;
2249 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2250 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2251 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2252 new_instance->reduc_phis = NULL;
2253 new_instance->cost_vec = vNULL;
2254 new_instance->subgraph_entries = vNULL;
2255
2256 vect_gather_slp_loads (new_instance, node);
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_NOTE, vect_location,
2259 "SLP size %u vs. limit %u.\n",
2260 tree_size, max_tree_size);
2261
2262 /* Check whether any load is possibly permuted. */
2263 slp_tree load_node;
2264 bool loads_permuted = false;
2265 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2266 {
2267 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2268 continue;
2269 unsigned j;
2270 stmt_vec_info load_info;
2271 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2272 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2273 {
2274 loads_permuted = true;
2275 break;
2276 }
2277 }
2278
2279 /* If the loads and stores can be handled with load/store-lane
2280 instructions do not generate this SLP instance. */
2281 if (is_a <loop_vec_info> (vinfo)
2282 && loads_permuted
2283 && kind == slp_inst_kind_store
2284 && vect_store_lanes_supported
2285 (STMT_VINFO_VECTYPE (scalar_stmts[0]), group_size, false))
2286 {
2287 slp_tree load_node;
2288 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2289 {
2290 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2291 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2292 /* Use SLP for strided accesses (or if we can't load-lanes). */
2293 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2294 || ! vect_load_lanes_supported
2295 (STMT_VINFO_VECTYPE (stmt_vinfo),
2296 DR_GROUP_SIZE (stmt_vinfo), false))
2297 break;
2298 }
2299 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2300 {
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "Built SLP cancelled: can use "
2304 "load/store-lanes\n");
2305 vect_free_slp_instance (new_instance);
2306 return false;
2307 }
2308 }
2309
2310 /* Fixup SLP reduction chains. */
2311 if (kind == slp_inst_kind_reduc_chain)
2312 {
2313 /* If this is a reduction chain with a conversion in front
2314 amend the SLP tree with a node for that. */
2315 gimple *scalar_def
2316 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2317 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2318 {
2319 /* Get at the conversion stmt - we know it's the single use
2320 of the last stmt of the reduction chain. */
2321 use_operand_p use_p;
2322 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2323 &use_p, &scalar_def);
2324 gcc_assert (r);
2325 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2326 next_info = vect_stmt_to_vectorize (next_info);
2327 scalar_stmts = vNULL;
2328 scalar_stmts.create (group_size);
2329 for (unsigned i = 0; i < group_size; ++i)
2330 scalar_stmts.quick_push (next_info);
2331 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2332 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2333 SLP_TREE_CHILDREN (conv).quick_push (node);
2334 SLP_INSTANCE_TREE (new_instance) = conv;
2335 /* We also have to fake this conversion stmt as SLP reduction
2336 group so we don't have to mess with too much code
2337 elsewhere. */
2338 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2339 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2340 }
2341 /* Fill the backedge child of the PHI SLP node. The
2342 general matching code cannot find it because the
2343 scalar code does not reflect how we vectorize the
2344 reduction. */
2345 use_operand_p use_p;
2346 imm_use_iterator imm_iter;
2347 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2348 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2349 gimple_get_lhs (scalar_def))
2350 /* There are exactly two non-debug uses, the reduction
2351 PHI and the loop-closed PHI node. */
2352 if (!is_gimple_debug (USE_STMT (use_p))
2353 && gimple_bb (USE_STMT (use_p)) == loop->header)
2354 {
2355 auto_vec<stmt_vec_info, 64> phis (group_size);
2356 stmt_vec_info phi_info
2357 = vinfo->lookup_stmt (USE_STMT (use_p));
2358 for (unsigned i = 0; i < group_size; ++i)
2359 phis.quick_push (phi_info);
2360 slp_tree *phi_node = bst_map->get (phis);
2361 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2362 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2363 = SLP_INSTANCE_TREE (new_instance);
2364 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2365 }
2366 }
2367
2368 vinfo->slp_instances.safe_push (new_instance);
2369
2370 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2371 the number of scalar stmts in the root in a few places.
2372 Verify that assumption holds. */
2373 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2374 .length () == group_size);
2375
2376 if (dump_enabled_p ())
2377 {
2378 dump_printf_loc (MSG_NOTE, vect_location,
2379 "Final SLP tree for instance %p:\n", new_instance);
2380 vect_print_slp_graph (MSG_NOTE, vect_location,
2381 SLP_INSTANCE_TREE (new_instance));
2382 }
2383
2384 return true;
2385 }
2386 }
2387 else
2388 {
2389 /* Failed to SLP. */
2390 /* Free the allocated memory. */
2391 scalar_stmts.release ();
2392 }
2393
2394 stmt_vec_info stmt_info = stmt_info_;
2395 /* Try to break the group up into pieces. */
2396 if (kind == slp_inst_kind_store)
2397 {
2398 /* ??? We could delay all the actual splitting of store-groups
2399 until after SLP discovery of the original group completed.
2400 Then we can recurse to vect_build_slp_instance directly. */
2401 for (i = 0; i < group_size; i++)
2402 if (!matches[i])
2403 break;
2404
2405 /* For basic block SLP, try to break the group up into multiples of
2406 a vector size. */
2407 if (is_a <bb_vec_info> (vinfo)
2408 && (i > 1 && i < group_size))
2409 {
2410 tree scalar_type
2411 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2412 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2413 1 << floor_log2 (i));
2414 unsigned HOST_WIDE_INT const_nunits;
2415 if (vectype
2416 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2417 {
2418 /* Split into two groups at the first vector boundary. */
2419 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
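/* E.g. (illustrative values) with const_nunits = 4 and the first mismatch
   at i = 6 the split point below is rounded down to group1_size = 4.  */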
2420 unsigned group1_size = i & ~(const_nunits - 1);
2421
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_NOTE, vect_location,
2424 "Splitting SLP group at stmt %u\n", i);
2425 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2426 group1_size);
2427 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2428 max_tree_size);
2429 /* Split the rest at the failure point and possibly
2430 re-analyze the remaining matching part if it has
2431 at least two lanes. */
2432 if (group1_size < i
2433 && (i + 1 < group_size
2434 || i - group1_size > 1))
2435 {
2436 stmt_vec_info rest2 = rest;
2437 rest = vect_split_slp_store_group (rest, i - group1_size);
2438 if (i - group1_size > 1)
2439 res |= vect_analyze_slp_instance (vinfo, bst_map,
2440 rest2, max_tree_size);
2441 }
2442 /* Re-analyze the non-matching tail if it has at least
2443 two lanes. */
2444 if (i + 1 < group_size)
2445 res |= vect_analyze_slp_instance (vinfo, bst_map,
2446 rest, max_tree_size);
2447 return res;
2448 }
2449 }
2450
2451 /* For loop vectorization split into arbitrary pieces of size > 1. */
2452 if (is_a <loop_vec_info> (vinfo)
2453 && (i > 1 && i < group_size))
2454 {
2455 unsigned group1_size = i;
2456
2457 if (dump_enabled_p ())
2458 dump_printf_loc (MSG_NOTE, vect_location,
2459 "Splitting SLP group at stmt %u\n", i);
2460
2461 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2462 group1_size);
2463 /* Loop vectorization cannot handle gaps in stores, make sure
2464 the split group appears as strided. */
2465 STMT_VINFO_STRIDED_P (rest) = 1;
2466 DR_GROUP_GAP (rest) = 0;
2467 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2468 DR_GROUP_GAP (stmt_info) = 0;
2469
2470 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2471 max_tree_size);
2472 if (i + 1 < group_size)
2473 res |= vect_analyze_slp_instance (vinfo, bst_map,
2474 rest, max_tree_size);
2475
2476 return res;
2477 }
2478
2479 /* Even though the first vector did not all match, we might be able to SLP
2480 (some) of the remainder. FORNOW ignore this possibility. */
2481 }
2482
2483 /* Failed to SLP. */
2484 if (dump_enabled_p ())
2485 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2486 return false;
2487 }
2488
2489
2490 /* Analyze an SLP instance starting from a group of grouped stores. Call
2491 vect_build_slp_tree to build a tree of packed stmts if possible.
2492 Return FALSE if it's impossible to SLP any stmt in the loop. */
2493
2494 static bool
2495 vect_analyze_slp_instance (vec_info *vinfo,
2496 scalar_stmts_to_slp_tree_map_t *bst_map,
2497 stmt_vec_info stmt_info, unsigned max_tree_size)
2498 {
2499 unsigned int group_size;
2500 unsigned int i;
2501 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2502 vec<stmt_vec_info> scalar_stmts;
2503 slp_instance_kind kind;
2504
2505 if (is_a <bb_vec_info> (vinfo))
2506 vect_location = stmt_info->stmt;
2507 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2508 {
2509 kind = slp_inst_kind_store;
2510 group_size = DR_GROUP_SIZE (stmt_info);
2511 }
2512 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2513 {
2514 kind = slp_inst_kind_reduc_chain;
2515 gcc_assert (is_a <loop_vec_info> (vinfo));
2516 group_size = REDUC_GROUP_SIZE (stmt_info);
2517 }
2518 else if (is_gimple_assign (stmt_info->stmt)
2519 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2520 {
2521 kind = slp_inst_kind_ctor;
2522 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2523 }
2524 else
2525 {
2526 kind = slp_inst_kind_reduc_group;
2527 gcc_assert (is_a <loop_vec_info> (vinfo));
2528 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2529 }
2530
2531 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2532 scalar_stmts.create (group_size);
2533 stmt_vec_info next_info = stmt_info;
2534 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2535 {
2536 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2537 while (next_info)
2538 {
2539 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2540 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2541 }
2542 }
2543 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2544 {
2545 /* Collect the reduction stmts and store them in
2546 SLP_TREE_SCALAR_STMTS. */
2547 while (next_info)
2548 {
2549 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2550 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2551 }
2552 /* Mark the first element of the reduction chain as reduction to properly
2553 transform the node. In the reduction analysis phase only the last
2554 element of the chain is marked as reduction. */
2555 STMT_VINFO_DEF_TYPE (stmt_info)
2556 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2557 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2558 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2559 }
2560 else if (kind == slp_inst_kind_ctor)
2561 {
2562 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2563 tree val;
2564 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2565 {
2566 if (TREE_CODE (val) == SSA_NAME)
2567 {
2568 gimple* def = SSA_NAME_DEF_STMT (val);
2569 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2570 /* Value is defined in another basic block. */
2571 if (!def_info)
2572 return false;
2573 def_info = vect_stmt_to_vectorize (def_info);
2574 scalar_stmts.safe_push (def_info);
2575 }
2576 else
2577 return false;
2578 }
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "Analyzing vectorizable constructor: %G\n",
2582 stmt_info->stmt);
2583 }
2584 else
2585 {
2586 /* Collect reduction statements. */
2587 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2588 for (i = 0; reductions.iterate (i, &next_info); i++)
2589 if (STMT_VINFO_RELEVANT_P (next_info)
2590 || STMT_VINFO_LIVE_P (next_info))
2591 scalar_stmts.quick_push (next_info);
2592 }
2593
2594 /* Build the tree for the SLP instance. */
2595 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2596 kind == slp_inst_kind_ctor
2597 ? stmt_info : NULL,
2598 max_tree_size,
2599 bst_map, stmt_info);
2600
2601 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2602 where we should do store group splitting. */
2603
2604 return res;
2605 }
2606
2607 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2608 trees of packed scalar stmts if SLP is possible. */
2609
2610 opt_result
2611 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2612 {
2613 unsigned int i;
2614 stmt_vec_info first_element;
2615
2616 DUMP_VECT_SCOPE ("vect_analyze_slp");
2617
2618 scalar_stmts_to_slp_tree_map_t *bst_map
2619 = new scalar_stmts_to_slp_tree_map_t ();
2620
2621 /* Find SLP sequences starting from groups of grouped stores. */
2622 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2623 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2624
2625 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2626 {
2627 /* Find SLP sequences starting from reduction chains. */
2628 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2629 if (! STMT_VINFO_RELEVANT_P (first_element)
2630 && ! STMT_VINFO_LIVE_P (first_element))
2631 ;
2632 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2633 max_tree_size))
2634 {
2635 /* Dissolve reduction chain group. */
2636 stmt_vec_info vinfo = first_element;
2637 stmt_vec_info last = NULL;
2638 while (vinfo)
2639 {
2640 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2641 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2642 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2643 last = vinfo;
2644 vinfo = next;
2645 }
2646 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2647 /* It can be still vectorized as part of an SLP reduction. */
2648 loop_vinfo->reductions.safe_push (last);
2649 }
2650
2651 /* Find SLP sequences starting from groups of reductions. */
2652 if (loop_vinfo->reductions.length () > 1)
2653 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2654 max_tree_size);
2655 }
2656
2657 /* The map keeps a reference on SLP nodes built, release that. */
2658 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2659 it != bst_map->end (); ++it)
2660 if ((*it).second)
2661 vect_free_slp_tree ((*it).second);
2662 delete bst_map;
2663
2664 return opt_result::success ();
2665 }
2666
2667 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graph. */
2668
2669 static void
2670 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2671 vec<slp_tree> &vertices, vec<int> &leafs)
2672 {
2673 unsigned i;
2674 slp_tree child;
2675
2676 if (visited.add (node))
2677 return;
2678
2679 node->vertex = vertices.length ();
2680 vertices.safe_push (node);
2681
2682 bool leaf = true;
2683 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2684 if (child)
2685 {
2686 leaf = false;
2687 vect_slp_build_vertices (visited, child, vertices, leafs);
2688 }
2689 if (leaf)
2690 leafs.safe_push (node->vertex);
2691 }
2692
2693 /* Fill the VERTICES and LEAFS vectors with all nodes in the SLP graph. */
2694
2695 static void
2696 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2697 vec<int> &leafs)
2698 {
2699 hash_set<slp_tree> visited;
2700 unsigned i;
2701 slp_instance instance;
2702 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2703 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2704 leafs);
2705 }
2706
2707 /* Apply the bijective permutation PERM to VEC, in reverse if REVERSE is true. */
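/* For example (illustrative values), with PERM = { 2, 0, 1 } and
   VEC = { a, b, c } the forward application yields { c, a, b }
   (vec[i] = saved[perm[i]]) while the reverse application yields
   { b, c, a } (vec[perm[i]] = saved[i]).  */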
2708
2709 template <class T>
2710 static void
2711 vect_slp_permute (vec<unsigned> perm,
2712 vec<T> &vec, bool reverse)
2713 {
2714 auto_vec<T, 64> saved;
2715 saved.create (vec.length ());
2716 for (unsigned i = 0; i < vec.length (); ++i)
2717 saved.quick_push (vec[i]);
2718
2719 if (reverse)
2720 {
2721 for (unsigned i = 0; i < vec.length (); ++i)
2722 vec[perm[i]] = saved[i];
2723 for (unsigned i = 0; i < vec.length (); ++i)
2724 gcc_assert (vec[perm[i]] == saved[i]);
2725 }
2726 else
2727 {
2728 for (unsigned i = 0; i < vec.length (); ++i)
2729 vec[i] = saved[perm[i]];
2730 for (unsigned i = 0; i < vec.length (); ++i)
2731 gcc_assert (vec[i] == saved[perm[i]]);
2732 }
2733 }
2734
2735 /* Return whether permutations PERM_A and PERM_B as recorded in the
2736 PERMS vector are equal. */
2737
2738 static bool
2739 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2740 int perm_a, int perm_b)
2741 {
2742 return (perm_a == perm_b
2743 || (perms[perm_a].length () == perms[perm_b].length ()
2744 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2745 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2746 }
2747
2748 /* Optimize the SLP graph of VINFO. */
2749
2750 void
2751 vect_optimize_slp (vec_info *vinfo)
2752 {
2753 if (vinfo->slp_instances.is_empty ())
2754 return;
2755
2756 slp_tree node;
2757 unsigned i;
2758 auto_vec<slp_tree> vertices;
2759 auto_vec<int> leafs;
2760 vect_slp_build_vertices (vinfo, vertices, leafs);
2761
2762 struct graph *slpg = new_graph (vertices.length ());
2763 FOR_EACH_VEC_ELT (vertices, i, node)
2764 {
2765 unsigned j;
2766 slp_tree child;
2767 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2768 if (child)
2769 add_edge (slpg, i, child->vertex);
2770 }
2771
2772 /* Compute (reverse) postorder on the inverted graph. */
2773 auto_vec<int> ipo;
2774 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
2775
2776 auto_sbitmap n_visited (vertices.length ());
2777 auto_sbitmap n_materialize (vertices.length ());
2778 auto_vec<int> n_perm (vertices.length ());
2779 auto_vec<vec<unsigned> > perms;
2780
2781 bitmap_clear (n_visited);
2782 bitmap_clear (n_materialize);
2783 n_perm.quick_grow_cleared (vertices.length ());
2784 perms.safe_push (vNULL); /* zero is no permute */
2785
2786 /* Produce initial permutations. */
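/* E.g. (illustrative values) a load permutation { 3, 2, 5, 4 } on a
   4-lane node spans exactly four consecutive elements and is bijective,
   so the candidate permute { 1, 0, 3, 2 } is recorded for propagation.  */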
2787 for (i = 0; i < leafs.length (); ++i)
2788 {
2789 int idx = leafs[i];
2790 slp_tree node = vertices[idx];
2791
2792 /* Handle externals and constants optimistically throughout the
2793 iteration. */
2794 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
2795 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
2796 continue;
2797
2798 /* Loads are the only thing generating permutes and leafs do not
2799 change across iterations. */
2800 bitmap_set_bit (n_visited, idx);
2801 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2802 continue;
2803
2804 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2805 node unpermuted, record this permute. */
2806 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
2807 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
2808 continue;
2809 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
2810 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
2811 bool any_permute = false;
2812 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2813 {
2814 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
2815 imin = MIN (imin, idx);
2816 imax = MAX (imax, idx);
2817 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
2818 any_permute = true;
2819 }
2820 /* If there's no permute no need to split one out. */
2821 if (!any_permute)
2822 continue;
2823 /* If the span doesn't match we'd disrupt VF computation, avoid
2824 that for now. */
2825 if (imax - imin + 1 != SLP_TREE_LANES (node))
2826 continue;
2827
2828 /* For now only handle true permutes, like
2829 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2830 when permuting constants and invariants keeping the permute
2831 bijective. */
2832 auto_sbitmap load_index (SLP_TREE_LANES (node));
2833 bitmap_clear (load_index);
2834 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2835 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
2836 unsigned j;
2837 for (j = 0; j < SLP_TREE_LANES (node); ++j)
2838 if (!bitmap_bit_p (load_index, j))
2839 break;
2840 if (j != SLP_TREE_LANES (node))
2841 continue;
2842
2843 vec<unsigned> perm = vNULL;
2844 perm.safe_grow (SLP_TREE_LANES (node), true);
2845 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
2846 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
2847 perms.safe_push (perm);
2848 n_perm[idx] = perms.length () - 1;
2849 }
2850
2851 /* Propagate permutes along the graph and compute materialization points. */
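/* Summary of the loop below: each internal node inherits the permute of
   its successors when they all agree and falls back to the identity
   (perm == 0) when they do not; a node whose users do not all share its
   permute becomes a materialization point where the permute is applied
   explicitly.  Iterate until no permute changes anymore.  */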
2852 bool changed;
2853 unsigned iteration = 0;
2854 do
2855 {
2856 changed = false;
2857 ++iteration;
2858
2859 for (i = vertices.length (); i > 0 ; --i)
2860 {
2861 int idx = ipo[i-1];
2862 slp_tree node = vertices[idx];
2863 /* For leafs there's nothing to do - we've seeded permutes
2864 on those above. */
2865 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2866 continue;
2867
2868 bitmap_set_bit (n_visited, idx);
2869
2870 /* We cannot move a permute across a store. */
2871 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2872 && DR_IS_WRITE
2873 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
2874 continue;
2875
2876 int perm = -1;
2877 for (graph_edge *succ = slpg->vertices[idx].succ;
2878 succ; succ = succ->succ_next)
2879 {
2880 int succ_idx = succ->dest;
2881 /* Handle unvisited nodes optimistically. */
2882 /* ??? But for constants once we want to handle non-bijective
2883 permutes we have to verify the permute, when unifying lanes,
2884 will not unify different constants. For example see
2885 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2886 if (!bitmap_bit_p (n_visited, succ_idx))
2887 continue;
2888 int succ_perm = n_perm[succ_idx];
2889 /* Once we materialize the successor's permutation its output lanes
2890 appear unpermuted to us. */
2891 if (bitmap_bit_p (n_materialize, succ_idx))
2892 succ_perm = 0;
2893 if (perm == -1)
2894 perm = succ_perm;
2895 else if (succ_perm == 0)
2896 {
2897 perm = 0;
2898 break;
2899 }
2900 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
2901 {
2902 perm = 0;
2903 break;
2904 }
2905 }
2906
2907 if (perm == -1)
2908 /* Pick up pre-computed leaf values. */
2909 perm = n_perm[idx];
2910 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
2911 {
2912 if (iteration > 1)
2913 /* Make sure we eventually converge. */
2914 gcc_checking_assert (perm == 0);
2915 n_perm[idx] = perm;
2916 if (perm == 0)
2917 bitmap_clear_bit (n_materialize, idx);
2918 changed = true;
2919 }
2920
2921 if (perm == 0)
2922 continue;
2923
2924 /* Elide pruning at materialization points in the first
2925 iteration so that every node gets visited at least once. */
2926 if (iteration == 1)
2927 continue;
2928
2929 /* Decide on permute materialization. Look whether there's
2930 a use (pred) edge that is permuted differently than us.
2931 In that case mark ourselves so the permutation is applied. */
2932 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
2933 for (graph_edge *pred = slpg->vertices[idx].pred;
2934 pred; pred = pred->pred_next)
2935 {
2936 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
2937 int pred_perm = n_perm[pred->src];
2938 if (!vect_slp_perms_eq (perms, perm, pred_perm))
2939 {
2940 all_preds_permuted = false;
2941 break;
2942 }
2943 }
2944 if (!all_preds_permuted)
2945 {
2946 if (!bitmap_bit_p (n_materialize, idx))
2947 changed = true;
2948 bitmap_set_bit (n_materialize, idx);
2949 }
2950 }
2951 }
2952 while (changed || iteration == 1);
2953
2954 /* Materialize. */
2955 for (i = 0; i < vertices.length (); ++i)
2956 {
2957 int perm = n_perm[i];
2958 if (perm <= 0)
2959 continue;
2960
2961 slp_tree node = vertices[i];
2962
2963 /* First permute invariant/external original successors. */
2964 unsigned j;
2965 slp_tree child;
2966 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2967 {
2968 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2969 continue;
2970
2971 /* If the vector is uniform there's nothing to do. */
2972 if (vect_slp_tree_uniform_p (child))
2973 continue;
2974
2975 /* We can end up sharing some externals via two_operator
2976 handling. Be prepared to unshare those. */
2977 if (child->refcnt != 1)
2978 {
2979 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
2980 SLP_TREE_CHILDREN (node)[j] = child
2981 = vect_create_new_slp_node
2982 (SLP_TREE_SCALAR_OPS (child).copy ());
2983 }
2984 vect_slp_permute (perms[perm],
2985 SLP_TREE_SCALAR_OPS (child), true);
2986 }
2987
2988 if (bitmap_bit_p (n_materialize, i))
2989 {
2990 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2991 /* For loads simply drop the permutation; the load permutation
2992 already performs the desired permutation. */
2993 ;
2994 else
2995 {
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_NOTE, vect_location,
2998 "inserting permute node in place of %p\n",
2999 node);
3000
3001 /* Make a copy of NODE and in-place change it to a
3002 VEC_PERM node to permute the lanes of the copy. */
3003 slp_tree copy = new _slp_tree;
3004 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3005 SLP_TREE_CHILDREN (node) = vNULL;
3006 SLP_TREE_SCALAR_STMTS (copy)
3007 = SLP_TREE_SCALAR_STMTS (node).copy ();
3008 vect_slp_permute (perms[perm],
3009 SLP_TREE_SCALAR_STMTS (copy), true);
3010 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3011 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3012 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3013 SLP_TREE_LANE_PERMUTATION (copy)
3014 = SLP_TREE_LANE_PERMUTATION (node);
3015 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3016 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3017 copy->refcnt = 1;
3018 copy->max_nunits = node->max_nunits;
3019 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3020 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3021 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3022
3023 /* Now turn NODE into a VEC_PERM. */
3024 SLP_TREE_CHILDREN (node).safe_push (copy);
3025 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3026 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3027 SLP_TREE_LANE_PERMUTATION (node)
3028 .quick_push (std::make_pair (0, perms[perm][j]));
3029 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3030 }
3031 }
3032 else
3033 {
3034 /* Apply the reverse permutation to our stmts. */
3035 vect_slp_permute (perms[perm],
3036 SLP_TREE_SCALAR_STMTS (node), true);
3037 /* And to the load permutation, which we can simply
3038 make regular by design. */
3039 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3040 {
3041 /* ??? When we handle non-bijective permutes the idea
3042 is that we can force the load-permutation to be
3043 { min, min + 1, min + 2, ... max }. But then the
3044 scalar defs might no longer match the lane content
3045 which means wrong-code with live lane vectorization.
3046 So we possibly have to have NULL entries for those. */
3047 vect_slp_permute (perms[perm],
3048 SLP_TREE_LOAD_PERMUTATION (node), true);
3049 }
3050 }
3051 }
3052
3053 /* Free the perms vector used for propagation. */
3054 while (!perms.is_empty ())
3055 perms.pop ().release ();
3056 free_graph (slpg);
3057
3058
3059 /* Now elide load permutations that are not necessary. */
3060 for (i = 0; i < leafs.length (); ++i)
3061 {
3062 node = vertices[leafs[i]];
3063 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3064 continue;
3065
3066 /* In basic block vectorization we allow any subchain of an interleaving
3067 chain.
3068 FORNOW: not in loop SLP because of realignment complications. */
3069 if (is_a <bb_vec_info> (vinfo))
3070 {
3071 bool subchain_p = true;
3072 stmt_vec_info next_load_info = NULL;
3073 stmt_vec_info load_info;
3074 unsigned j;
3075 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3076 {
3077 if (j != 0
3078 && (next_load_info != load_info
3079 || DR_GROUP_GAP (load_info) != 1))
3080 {
3081 subchain_p = false;
3082 break;
3083 }
3084 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3085 }
3086 if (subchain_p)
3087 {
3088 SLP_TREE_LOAD_PERMUTATION (node).release ();
3089 continue;
3090 }
3091 }
3092 else
3093 {
3094 stmt_vec_info load_info;
3095 bool this_load_permuted = false;
3096 unsigned j;
3097 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3098 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3099 {
3100 this_load_permuted = true;
3101 break;
3102 }
3103 stmt_vec_info first_stmt_info
3104 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3105 if (!this_load_permuted
3106 /* The load requires permutation when unrolling exposes
3107 a gap either because the group is larger than the SLP
3108 group-size or because there is a gap between the groups. */
3109 && (known_eq (LOOP_VINFO_VECT_FACTOR
3110 (as_a <loop_vec_info> (vinfo)), 1U)
3111 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3112 && DR_GROUP_GAP (first_stmt_info) == 0)))
3113 {
3114 SLP_TREE_LOAD_PERMUTATION (node).release ();
3115 continue;
3116 }
3117 }
3118 }
3119 }
3120
3121
3122 /* For each possible SLP instance decide whether to SLP it and calculate overall
3123 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3124 least one instance. */
3125
3126 bool
3127 vect_make_slp_decision (loop_vec_info loop_vinfo)
3128 {
3129 unsigned int i;
3130 poly_uint64 unrolling_factor = 1;
3131 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3132 slp_instance instance;
3133 int decided_to_slp = 0;
3134
3135 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3136
3137 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3138 {
3139 /* FORNOW: SLP if you can. */
3140 /* All unroll factors have the form:
3141
3142 GET_MODE_SIZE (vinfo->vector_mode) * X
3143
3144 for some rational X, so they must have a common multiple. */
3145 unrolling_factor
3146 = force_common_multiple (unrolling_factor,
3147 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3148
3149 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3150 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3151 loop-based vectorization. Such stmts will be marked as HYBRID. */
3152 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3153 decided_to_slp++;
3154 }
3155
3156 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3157
3158 if (decided_to_slp && dump_enabled_p ())
3159 {
3160 dump_printf_loc (MSG_NOTE, vect_location,
3161 "Decided to SLP %d instances. Unrolling factor ",
3162 decided_to_slp);
3163 dump_dec (MSG_NOTE, unrolling_factor);
3164 dump_printf (MSG_NOTE, "\n");
3165 }
3166
3167 return (decided_to_slp > 0);
3168 }
3169
3170 /* Private data for vect_detect_hybrid_slp. */
3171 struct vdhs_data
3172 {
3173 loop_vec_info loop_vinfo;
3174 vec<stmt_vec_info> *worklist;
3175 };
3176
3177 /* Walker for walk_gimple_op. If the def stmt of use *TP is pure SLP, mark it as hybrid and push it to the worklist for further propagation. */
3178
3179 static tree
3180 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3181 {
3182 walk_stmt_info *wi = (walk_stmt_info *)data;
3183 vdhs_data *dat = (vdhs_data *)wi->info;
3184
3185 if (wi->is_lhs)
3186 return NULL_TREE;
3187
3188 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3189 if (!def_stmt_info)
3190 return NULL_TREE;
3191 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3192 if (PURE_SLP_STMT (def_stmt_info))
3193 {
3194 if (dump_enabled_p ())
3195 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3196 def_stmt_info->stmt);
3197 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3198 dat->worklist->safe_push (def_stmt_info);
3199 }
3200
3201 return NULL_TREE;
3202 }
3203
3204 /* Find stmts that must be both vectorized and SLPed. */
3205
3206 void
3207 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3208 {
3209 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3210
3211 /* All stmts participating in SLP are marked pure_slp, all other
3212 stmts are loop_vect.
3213 First collect all loop_vect stmts into a worklist. */
3214 auto_vec<stmt_vec_info> worklist;
3215 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3216 {
3217 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3218 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3219 gsi_next (&gsi))
3220 {
3221 gphi *phi = gsi.phi ();
3222 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3223 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3224 worklist.safe_push (stmt_info);
3225 }
3226 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3227 gsi_next (&gsi))
3228 {
3229 gimple *stmt = gsi_stmt (gsi);
3230 if (is_gimple_debug (stmt))
3231 continue;
3232 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3233 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3234 {
3235 for (gimple_stmt_iterator gsi2
3236 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3237 !gsi_end_p (gsi2); gsi_next (&gsi2))
3238 {
3239 stmt_vec_info patt_info
3240 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3241 if (!STMT_SLP_TYPE (patt_info)
3242 && STMT_VINFO_RELEVANT (patt_info))
3243 worklist.safe_push (patt_info);
3244 }
3245 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3246 }
3247 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3248 worklist.safe_push (stmt_info);
3249 }
3250 }
3251
3252 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3253 mark any SLP vectorized stmt as hybrid.
3254 ??? We're visiting def stmts N times (once for each non-SLP and
3255 once for each hybrid-SLP use). */
3256 walk_stmt_info wi;
3257 vdhs_data dat;
3258 dat.worklist = &worklist;
3259 dat.loop_vinfo = loop_vinfo;
3260 memset (&wi, 0, sizeof (wi));
3261 wi.info = (void *)&dat;
3262 while (!worklist.is_empty ())
3263 {
3264 stmt_vec_info stmt_info = worklist.pop ();
3265 /* Since SSA operands are not set up for pattern stmts we need
3266 to use walk_gimple_op. */
3267 wi.is_lhs = 0;
3268 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3269 }
3270 }
3271
3272
3273 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3274
3275 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3276 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
3277 {
3278 for (unsigned i = 0; i < bbs.length (); ++i)
3279 {
3280 if (i != 0)
3281 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3282 gsi_next (&si))
3283 {
3284 gphi *phi = si.phi ();
3285 gimple_set_uid (phi, 0);
3286 add_stmt (phi);
3287 }
3288 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3289 !gsi_end_p (gsi); gsi_next (&gsi))
3290 {
3291 gimple *stmt = gsi_stmt (gsi);
3292 gimple_set_uid (stmt, 0);
3293 if (is_gimple_debug (stmt))
3294 continue;
3295 add_stmt (stmt);
3296 }
3297 }
3298 }
3299
3300
3301 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3302 stmts in the region's basic blocks. */
3303
3304 _bb_vec_info::~_bb_vec_info ()
3305 {
3306 /* Reset region marker. */
3307 for (unsigned i = 0; i < bbs.length (); ++i)
3308 {
3309 if (i != 0)
3310 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3311 gsi_next (&si))
3312 {
3313 gphi *phi = si.phi ();
3314 gimple_set_uid (phi, -1);
3315 }
3316 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3317 !gsi_end_p (gsi); gsi_next (&gsi))
3318 {
3319 gimple *stmt = gsi_stmt (gsi);
3320 gimple_set_uid (stmt, -1);
3321 }
3322 }
3323 }
3324
3325 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3326 given that its child nodes have already been processed, and that
3327 their def types currently match their SLP node's def type. */
3328
3329 static bool
3330 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3331 slp_instance node_instance,
3332 stmt_vector_for_cost *cost_vec)
3333 {
3334 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3335 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3336
3337 /* Calculate the number of vector statements to be created for the
3338 scalar stmts in this node. For SLP reductions it is equal to the
3339 number of vector statements in the children (which has already been
3340 calculated by the recursive call). Otherwise it is the number of
3341 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3342 VF divided by the number of elements in a vector. */
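/* For instance (illustrative values), with DR_GROUP_SIZE = 2, VF = 4 and
   a 4-element vector type this is (4 * 2) / 4 = 2 vector statements.  */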
3343 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3344 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3345 {
3346 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3347 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3348 {
3349 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3350 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3351 break;
3352 }
3353 }
3354 else
3355 {
3356 poly_uint64 vf;
3357 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3358 vf = loop_vinfo->vectorization_factor;
3359 else
3360 vf = 1;
3361 unsigned int group_size = SLP_TREE_LANES (node);
3362 tree vectype = SLP_TREE_VECTYPE (node);
3363 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3364 = vect_get_num_vectors (vf * group_size, vectype);
3365 }
3366
3367 /* Handle purely internal nodes. */
3368 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3369 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3370
3371 if (is_a <bb_vec_info> (vinfo)
3372 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3373 {
3374 if (dump_enabled_p ())
3375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3376 "desired vector type conflicts with earlier one "
3377 "for %G", stmt_info->stmt);
3378 return false;
3379 }
3380
3381 bool dummy;
3382 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3383 node, node_instance, cost_vec);
3384 }
3385
3386 /* Try to build NODE from scalars, returning true on success.
3387 NODE_INSTANCE is the SLP instance that contains NODE. */
3388
3389 static bool
3390 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3391 slp_instance node_instance)
3392 {
3393 stmt_vec_info stmt_info;
3394 unsigned int i;
3395
3396 if (!is_a <bb_vec_info> (vinfo)
3397 || node == SLP_INSTANCE_TREE (node_instance)
3398 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3399 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3400 return false;
3401
3402 if (dump_enabled_p ())
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "Building vector operands of %p from scalars instead\n", node);
3405
3406 /* Don't remove and free the child nodes here, since they could be
3407 referenced by other structures. The analysis and scheduling phases
3408 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3409 unsigned int group_size = SLP_TREE_LANES (node);
3410 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3411 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3412 SLP_TREE_LOAD_PERMUTATION (node).release ();
3413 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3414 {
3415 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3416 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3417 }
3418 return true;
3419 }
3420
3421 /* Compute the prologue cost for invariant or constant operands represented
3422 by NODE. */
3423
3424 static void
3425 vect_prologue_cost_for_slp (slp_tree node,
3426 stmt_vector_for_cost *cost_vec)
3427 {
3428 /* There's a special case of an existing vector, that costs nothing. */
3429 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3430 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3431 return;
3432 /* Without looking at the actual initializer a vector of
3433 constants can be implemented as a load from the constant pool.
3434 When all elements are the same we can use a splat. */
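/* Accordingly, the loop below records one cost entry per NELT_LIMIT scalar
   operands examined: scalar_to_vec (a splat) for external defs with all
   elements equal, vec_construct for other external defs, and vector_load
   (constant pool) for constant defs.  */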
3435 tree vectype = SLP_TREE_VECTYPE (node);
3436 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3437 unsigned num_vects_to_check;
3438 unsigned HOST_WIDE_INT const_nunits;
3439 unsigned nelt_limit;
3440 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3441 && ! multiple_p (const_nunits, group_size))
3442 {
3443 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3444 nelt_limit = const_nunits;
3445 }
3446 else
3447 {
3448 /* If either the vector has variable length or the vectors
3449 are composed of repeated whole groups we only need to
3450 cost construction once. All vectors will be the same. */
3451 num_vects_to_check = 1;
3452 nelt_limit = group_size;
3453 }
3454 tree elt = NULL_TREE;
3455 unsigned nelt = 0;
3456 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3457 {
3458 unsigned si = j % group_size;
3459 if (nelt == 0)
3460 elt = SLP_TREE_SCALAR_OPS (node)[si];
3461 /* ??? We're just tracking whether all operands of a single
3462 vector initializer are the same, ideally we'd check if
3463 we emitted the same one already. */
3464 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3465 elt = NULL_TREE;
3466 nelt++;
3467 if (nelt == nelt_limit)
3468 {
3469 record_stmt_cost (cost_vec, 1,
3470 SLP_TREE_DEF_TYPE (node) == vect_external_def
3471 ? (elt ? scalar_to_vec : vec_construct)
3472 : vector_load,
3473 NULL, vectype, 0, vect_prologue);
3474 nelt = 0;
3475 }
3476 }
3477 }
3478
3479 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3480 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3481
3482 Return true if the operations are supported. */
3483
3484 static bool
3485 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3486 slp_instance node_instance,
3487 hash_set<slp_tree> &visited_set,
3488 vec<slp_tree> &visited_vec,
3489 stmt_vector_for_cost *cost_vec)
3490 {
3491 int i, j;
3492 slp_tree child;
3493
3494 /* Assume we can code-generate all invariants. */
3495 if (!node
3496 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3497 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3498 return true;
3499
3500 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3501 {
3502 if (dump_enabled_p ())
3503 dump_printf_loc (MSG_NOTE, vect_location,
3504 "Failed cyclic SLP reference in %p", node);
3505 return false;
3506 }
3507 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3508
3509 /* If we already analyzed the exact same set of scalar stmts we're done.
3510 We share the generated vector stmts for those. */
3511 if (visited_set.add (node))
3512 return true;
3513 visited_vec.safe_push (node);
3514
3515 bool res = true;
3516 unsigned visited_rec_start = visited_vec.length ();
3517 unsigned cost_vec_rec_start = cost_vec->length ();
3518 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3519 {
3520 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3521 visited_set, visited_vec,
3522 cost_vec);
3523 if (!res)
3524 break;
3525 }
3526
3527 if (res)
3528 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3529 cost_vec);
3530 /* If analysis failed we have to pop all recursive visited nodes
3531 plus ourselves. */
3532 if (!res)
3533 {
3534 while (visited_vec.length () >= visited_rec_start)
3535 visited_set.remove (visited_vec.pop ());
3536 cost_vec->truncate (cost_vec_rec_start);
3537 }
3538
3539 /* When the node can be vectorized, cost the invariant nodes it references.
3540 This is not done in DFS order to allow the referring node's
3541 vectorizable_* calls to nail down the invariant nodes' vector type
3542 and possibly unshare it if it needs a different vector type than
3543 other referrers. */
3544 if (res)
3545 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3546 if (child
3547 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3548 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3549 /* Perform usual caching, note code-generation still
3550 code-gens these nodes multiple times but we expect
3551 to CSE them later. */
3552 && !visited_set.add (child))
3553 {
3554 visited_vec.safe_push (child);
3555 /* ??? After auditing more code paths make a "default"
3556 and push the vector type from NODE to all children
3557 if it is not already set. */
3558 /* Compute the number of vectors to be generated. */
3559 tree vector_type = SLP_TREE_VECTYPE (child);
3560 if (!vector_type)
3561 {
3562 /* For shifts with a scalar argument we don't need
3563 to cost or code-generate anything.
3564 ??? Represent this more explicitly. */
3565 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3566 == shift_vec_info_type)
3567 && j == 1);
3568 continue;
3569 }
3570 unsigned group_size = SLP_TREE_LANES (child);
3571 poly_uint64 vf = 1;
3572 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3573 vf = loop_vinfo->vectorization_factor;
3574 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3575 = vect_get_num_vectors (vf * group_size, vector_type);
3576 /* And cost them. */
3577 vect_prologue_cost_for_slp (child, cost_vec);
3578 }
3579
3580 /* If this node or any of its children can't be vectorized, try pruning
3581 the tree here rather than felling the whole thing. */
3582 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3583 {
3584 /* We'll need to revisit this for invariant costing and number
3585 of vectorized stmt setting. */
3586 res = true;
3587 }
3588
3589 return res;
3590 }
3591
3592
3593 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3594 region and that can be vectorized using vectorizable_live_operation
3595 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
3596 the scalar code computing them to be retained. */
3597
3598 static void
3599 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3600 slp_instance instance,
3601 stmt_vector_for_cost *cost_vec,
3602 hash_set<stmt_vec_info> &svisited,
3603 hash_set<slp_tree> &visited)
3604 {
3605 if (visited.add (node))
3606 return;
3607
3608 unsigned i;
3609 stmt_vec_info stmt_info;
3610 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3611 bool all_visited = true;
3612 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3613 {
3614 if (svisited.contains (stmt_info))
3615 continue;
3616 all_visited = false;
3617 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3618 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3619 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3620 /* Only the pattern root stmt computes the original scalar value. */
3621 continue;
3622 bool mark_visited = true;
3623 gimple *orig_stmt = orig_stmt_info->stmt;
3624 ssa_op_iter op_iter;
3625 def_operand_p def_p;
3626 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3627 {
3628 imm_use_iterator use_iter;
3629 gimple *use_stmt;
3630 stmt_vec_info use_stmt_info;
3631 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3632 if (!is_gimple_debug (use_stmt))
3633 {
3634 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3635 if (!use_stmt_info
3636 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3637 {
3638 STMT_VINFO_LIVE_P (stmt_info) = true;
3639 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3640 NULL, node, instance, i,
3641 false, cost_vec))
3642 /* ??? So we know we can vectorize the live stmt
3643 from one SLP node. If we cannot do so from all
3644 or none consistently we'd have to record which
3645 SLP node (and lane) we want to use for the live
3646 operation. So make sure we can code-generate
3647 from all nodes. */
3648 mark_visited = false;
3649 else
3650 STMT_VINFO_LIVE_P (stmt_info) = false;
3651 BREAK_FROM_IMM_USE_STMT (use_iter);
3652 }
3653 }
3654 /* We have to verify whether we can insert the lane extract
3655 before all uses. The following is a conservative approximation.
3656 We cannot put this into vectorizable_live_operation because
3657 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3658 doesn't work.
3659 Note that while the fact that we emit code for loads at the
3660 first load should make this a non-problem, leaves we construct
3661 from scalars are vectorized after the last scalar def.
3662 ??? If we'd actually compute the insert location during
3663 analysis we could use something less conservative than the last
3664 scalar stmt in the node for the dominance check. */
3665 /* ??? What remains are "live" uses in vector CTORs in the same
3666 SLP graph which is where those uses can end up code-generated
3667 right after their definition instead of close to their original
3668 use. But that would restrict us to code-generate lane-extracts
3669 from the latest stmt in a node. So we compensate for this
3670 during code-generation, simply not replacing uses for those
3671 hopefully rare cases. */
3672 if (STMT_VINFO_LIVE_P (stmt_info))
3673 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3674 if (!is_gimple_debug (use_stmt)
3675 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3676 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3677 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3678 {
3679 if (dump_enabled_p ())
3680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3681 "Cannot determine insertion place for "
3682 "lane extract\n");
3683 STMT_VINFO_LIVE_P (stmt_info) = false;
3684 mark_visited = true;
3685 }
3686 }
3687 if (mark_visited)
3688 svisited.add (stmt_info);
3689 }
3690 if (all_visited)
3691 return;
3692
3693 slp_tree child;
3694 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3695 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3696 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3697 cost_vec, svisited, visited);
3698 }
3699
3700 /* Analyze statements in SLP instances of VINFO. Return true if the
3701 operations are supported. */
3702
3703 bool
3704 vect_slp_analyze_operations (vec_info *vinfo)
3705 {
3706 slp_instance instance;
3707 int i;
3708
3709 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3710
3711 hash_set<slp_tree> visited;
3712 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3713 {
3714 auto_vec<slp_tree> visited_vec;
3715 stmt_vector_for_cost cost_vec;
3716 cost_vec.create (2);
3717 if (is_a <bb_vec_info> (vinfo))
3718 vect_location = instance->location ();
3719 if (!vect_slp_analyze_node_operations (vinfo,
3720 SLP_INSTANCE_TREE (instance),
3721 instance, visited, visited_vec,
3722 &cost_vec)
3723 /* Instances with a root stmt require vectorized defs for the
3724 SLP tree root. */
3725 || (SLP_INSTANCE_ROOT_STMT (instance)
3726 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3727 != vect_internal_def)))
3728 {
3729 slp_tree node = SLP_INSTANCE_TREE (instance);
3730 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3731 if (dump_enabled_p ())
3732 dump_printf_loc (MSG_NOTE, vect_location,
3733 "removing SLP instance operations starting from: %G",
3734 stmt_info->stmt);
3735 vect_free_slp_instance (instance);
3736 vinfo->slp_instances.ordered_remove (i);
3737 cost_vec.release ();
3738 while (!visited_vec.is_empty ())
3739 visited.remove (visited_vec.pop ());
3740 }
3741 else
3742 {
3743 i++;
3744
3745 /* For BB vectorization remember the SLP graph entry
3746 cost for later. */
3747 if (is_a <bb_vec_info> (vinfo))
3748 instance->cost_vec = cost_vec;
3749 else
3750 {
3751 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3752 cost_vec.release ();
3753 }
3754 }
3755 }
3756
3757 /* Compute vectorizable live stmts. */
3758 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3759 {
3760 hash_set<stmt_vec_info> svisited;
3761 hash_set<slp_tree> visited;
3762 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3763 {
3764 vect_location = instance->location ();
3765 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3766 instance, &instance->cost_vec, svisited,
3767 visited);
3768 }
3769 }
3770
3771 return !vinfo->slp_instances.is_empty ();
3772 }
3773
3774 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
3775 transitively compressing any chain of leaders to it on the way. */
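/* For example (hypothetical instances A, B and C): with INSTANCE_LEADER
   mapping A -> B, B -> C and C -> C, calling this function on A returns C
   and rewrites the entries to A -> C and B -> C, so later queries take a
   single step.  */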
3776
3777 static slp_instance
3778 get_ultimate_leader (slp_instance instance,
3779 hash_map<slp_instance, slp_instance> &instance_leader)
3780 {
3781 auto_vec<slp_instance *, 8> chain;
3782 slp_instance *tem;
3783 while (*(tem = instance_leader.get (instance)) != instance)
3784 {
3785 chain.safe_push (tem);
3786 instance = *tem;
3787 }
3788 while (!chain.is_empty ())
3789 *chain.pop () = instance;
3790 return instance;
3791 }
3792
3793 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3794
3795 static void
3796 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3797 slp_instance instance, slp_tree node,
3798 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3799 hash_map<slp_instance, slp_instance> &instance_leader,
3800 hash_set<slp_tree> &visited)
3801 {
3802 stmt_vec_info stmt_info;
3803 unsigned i;
3804
3805 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3806 {
3807 bool existed_p;
3808 slp_instance &stmt_instance
3809 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3810 if (!existed_p)
3811 ;
3812 else if (stmt_instance != instance)
3813 {
3814 /* If we're running into a previously marked stmt make us the
3815 leader of the current ultimate leader. This keeps the
3816 leader chain acyclic and works even when the current instance
3817 connects two previously independent graph parts. */
3818 slp_instance stmt_leader
3819 = get_ultimate_leader (stmt_instance, instance_leader);
3820 if (stmt_leader != instance)
3821 instance_leader.put (stmt_leader, instance);
3822 }
3823 stmt_instance = instance;
3824 }
3825
3826 if (visited.add (node))
3827 return;
3828
3829 slp_tree child;
3830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3831 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3832 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3833 instance_leader, visited);
3834 }
3835
3836 /* Partition the SLP graph into pieces that can be costed independently. */
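/* As a sketch of the intent (hypothetical instances): if one instance
   stores to a[] and another stores to b[] but both load from the same
   group c[], the shared scalar load stmts cause one instance to become
   the ultimate leader of the other and both end up in a single
   subgraph_entries vector that is costed as a whole.  */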
3837
3838 static void
3839 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3840 {
3841 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3842
3843 /* First walk the SLP graph assigning each involved scalar stmt a
3844 corresponding SLP graph entry and upon visiting a previously
3845 marked stmt, make the stmt's leader the current SLP graph entry. */
3846 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3847 hash_map<slp_instance, slp_instance> instance_leader;
3848 hash_set<slp_tree> visited;
3849 slp_instance instance;
3850 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3851 {
3852 instance_leader.put (instance, instance);
3853 vect_bb_partition_graph_r (bb_vinfo,
3854 instance, SLP_INSTANCE_TREE (instance),
3855 stmt_to_instance, instance_leader,
3856 visited);
3857 }
3858
3859 /* Then collect entries to each independent subgraph. */
3860 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3861 {
3862 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3863 leader->subgraph_entries.safe_push (instance);
3864 if (dump_enabled_p ()
3865 && leader != instance)
3866 dump_printf_loc (MSG_NOTE, vect_location,
3867 "instance %p is leader of %p\n",
3868 leader, instance);
3869 }
3870 }
3871
3872 /* Compute the scalar cost of the SLP node NODE and its children
3873 and record it in COST_VEC. Do not account defs that are marked in
3874 LIFE and update LIFE according to uses of NODE. */
3875
3876 static void
3877 vect_bb_slp_scalar_cost (vec_info *vinfo,
3878 slp_tree node, vec<bool, va_heap> *life,
3879 stmt_vector_for_cost *cost_vec,
3880 hash_set<slp_tree> &visited)
3881 {
3882 unsigned i;
3883 stmt_vec_info stmt_info;
3884 slp_tree child;
3885
3886 if (visited.add (node))
3887 return;
3888
3889 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3890 {
3891 ssa_op_iter op_iter;
3892 def_operand_p def_p;
3893
3894 if ((*life)[i])
3895 continue;
3896
3897 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3898 gimple *orig_stmt = orig_stmt_info->stmt;
3899
3900 /* If there is a non-vectorized use of the defs then the scalar
3901 stmt is kept live in which case we do not account it or any
3902 required defs in the SLP children in the scalar cost. This
3903 way we make the vectorization more costly when compared to
3904 the scalar cost. */
3905 if (!STMT_VINFO_LIVE_P (stmt_info))
3906 {
3907 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3908 {
3909 imm_use_iterator use_iter;
3910 gimple *use_stmt;
3911 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3912 if (!is_gimple_debug (use_stmt))
3913 {
3914 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3915 if (!use_stmt_info
3916 || !PURE_SLP_STMT
3917 (vect_stmt_to_vectorize (use_stmt_info)))
3918 {
3919 (*life)[i] = true;
3920 BREAK_FROM_IMM_USE_STMT (use_iter);
3921 }
3922 }
3923 }
3924 if ((*life)[i])
3925 continue;
3926 }
3927
3928 /* Count scalar stmts only once. */
3929 if (gimple_visited_p (orig_stmt))
3930 continue;
3931 gimple_set_visited (orig_stmt, true);
3932
3933 vect_cost_for_stmt kind;
3934 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3935 {
3936 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3937 kind = scalar_load;
3938 else
3939 kind = scalar_store;
3940 }
3941 else if (vect_nop_conversion_p (orig_stmt_info))
3942 continue;
3943 else
3944 kind = scalar_stmt;
3945 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
3946 SLP_TREE_VECTYPE (node), 0, vect_body);
3947 }
3948
3949 auto_vec<bool, 20> subtree_life;
3950 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3951 {
3952 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3953 {
3954 /* Do not directly pass LIFE to the recursive call, copy it to
3955 confine changes in the callee to the current child/subtree. */
3956 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3957 {
3958 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3959 for (unsigned j = 0;
3960 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3961 {
3962 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3963 if (perm.first == i)
3964 subtree_life[perm.second] = (*life)[j];
3965 }
3966 }
3967 else
3968 {
3969 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3970 subtree_life.safe_splice (*life);
3971 }
3972 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3973 visited);
3974 subtree_life.truncate (0);
3975 }
3976 }
3977 }
3978
3979 /* Check if vectorization of the basic block is profitable for the
3980 subgraph denoted by SLP_INSTANCES. */
3981
3982 static bool
3983 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3984 vec<slp_instance> slp_instances)
3985 {
3986 slp_instance instance;
3987 int i;
3988 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3989 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3990
3991 void *vect_target_cost_data = init_cost (NULL);
3992
3993 /* Calculate scalar cost and sum the cost for the vector stmts
3994 previously collected. */
3995 stmt_vector_for_cost scalar_costs;
3996 scalar_costs.create (0);
3997 hash_set<slp_tree> visited;
3998 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3999 {
4000 auto_vec<bool, 20> life;
4001 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4002 true);
4003 vect_bb_slp_scalar_cost (bb_vinfo,
4004 SLP_INSTANCE_TREE (instance),
4005 &life, &scalar_costs, visited);
4006 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4007 instance->cost_vec.release ();
4008 }
4009 /* Unset visited flag. */
4010 stmt_info_for_cost *si;
4011 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4012 gimple_set_visited (si->stmt_info->stmt, false);
4013
4014 void *scalar_target_cost_data = init_cost (NULL);
4015 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4016 scalar_costs.release ();
4017 unsigned dummy;
4018 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4019 destroy_cost_data (scalar_target_cost_data);
4020
4021 /* Complete the target-specific vector cost calculation. */
4022 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4023 &vec_inside_cost, &vec_epilogue_cost);
4024 destroy_cost_data (vect_target_cost_data);
4025
4026 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4027
4028 if (dump_enabled_p ())
4029 {
4030 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4031 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4032 vec_inside_cost);
4033 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4034 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4035 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4036 }
4037
4038 /* Vectorization is profitable if its cost is no more than the cost of the
4039 scalar version. Note that we err on the vector side for equal cost because
4040 the cost estimate is otherwise quite pessimistic (constant uses are
4041 free on the scalar side but cost a load on the vector side for
4042 example). */
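  /* A small worked example (numbers are illustrative only): with
     vec_inside_cost = 4, vec_prologue_cost = 2, vec_epilogue_cost = 0
     and scalar_cost = 6 we have 4 + 2 <= 6 and the subgraph is kept;
     with scalar_cost = 5 the check below rejects it.  */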
4043 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4044 return false;
4045
4046 return true;
4047 }
4048
4049 /* Find any vectorizable constructors and add them to the grouped_store
4050 array. */
4051
4052 static void
4053 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4054 {
4055 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4056 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4057 !gsi_end_p (gsi); gsi_next (&gsi))
4058 {
4059 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4060 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
4061 continue;
4062
4063 tree rhs = gimple_assign_rhs1 (assign);
4064 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4065 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4066 CONSTRUCTOR_NELTS (rhs))
4067 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4068 || uniform_vector_p (rhs))
4069 continue;
4070
4071 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4072 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4073 }
4074 }
4075
4076 /* Check if the region described by BB_VINFO can be vectorized, returning
4077 true if so. When returning false, set FATAL to true if the same failure
4078 would prevent vectorization at other vector sizes, false if it is still
4079 worth trying other sizes. N_STMTS is the number of statements in the
4080 region. */
4081
4082 static bool
4083 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4084 vec<int> *dataref_groups)
4085 {
4086 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4087
4088 slp_instance instance;
4089 int i;
4090 poly_uint64 min_vf = 2;
4091
4092 /* The first group of checks is independent of the vector size. */
4093 fatal = true;
4094
4095 /* Analyze the data references. */
4096
4097 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4098 {
4099 if (dump_enabled_p ())
4100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4101 "not vectorized: unhandled data-ref in basic "
4102 "block.\n");
4103 return false;
4104 }
4105
4106 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4107 {
4108 if (dump_enabled_p ())
4109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4110 "not vectorized: unhandled data access in "
4111 "basic block.\n");
4112 return false;
4113 }
4114
4115 vect_slp_check_for_constructors (bb_vinfo);
4116
4117 /* If there are no grouped stores and no constructors in the region
4118 there is no need to continue with pattern recog as vect_analyze_slp
4119 will fail anyway. */
4120 if (bb_vinfo->grouped_stores.is_empty ())
4121 {
4122 if (dump_enabled_p ())
4123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4124 "not vectorized: no grouped stores in "
4125 "basic block.\n");
4126 return false;
4127 }
4128
4129 /* The rest of the analysis below depends on the vector size in some way. */
4130 fatal = false;
4131
4132 vect_pattern_recog (bb_vinfo);
4133
4134 /* Check the SLP opportunities in the basic block, analyze and build SLP
4135 trees. */
4136 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4137 {
4138 if (dump_enabled_p ())
4139 {
4140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4141 "Failed to SLP the basic block.\n");
4142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4143 "not vectorized: failed to find SLP opportunities "
4144 "in basic block.\n");
4145 }
4146 return false;
4147 }
4148
4149 /* Optimize permutations. */
4150 vect_optimize_slp (bb_vinfo);
4151
4152 vect_record_base_alignments (bb_vinfo);
4153
4154 /* Analyze and verify the alignment of data references and the
4155 dependence in the SLP instances. */
4156 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4157 {
4158 vect_location = instance->location ();
4159 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4160 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4161 {
4162 slp_tree node = SLP_INSTANCE_TREE (instance);
4163 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4164 if (dump_enabled_p ())
4165 dump_printf_loc (MSG_NOTE, vect_location,
4166 "removing SLP instance operations starting from: %G",
4167 stmt_info->stmt);
4168 vect_free_slp_instance (instance);
4169 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4170 continue;
4171 }
4172
4173 /* Mark all the statements that we want to vectorize as pure SLP and
4174 relevant. */
4175 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4176 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4177 if (SLP_INSTANCE_ROOT_STMT (instance))
4178 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
4179
4180 i++;
4181 }
4182 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4183 return false;
4184
4185 if (!vect_slp_analyze_operations (bb_vinfo))
4186 {
4187 if (dump_enabled_p ())
4188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4189 "not vectorized: bad operation in basic block.\n");
4190 return false;
4191 }
4192
4193 vect_bb_partition_graph (bb_vinfo);
4194
4195 return true;
4196 }
4197
4198 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4199 basic blocks in BBS, returning true on success.
4200 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4201
4202 static bool
4203 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4204 vec<int> *dataref_groups, unsigned int n_stmts)
4205 {
4206 bb_vec_info bb_vinfo;
4207 auto_vector_modes vector_modes;
4208
4209 /* Autodetect first vector size we try. */
4210 machine_mode next_vector_mode = VOIDmode;
4211 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4212 unsigned int mode_i = 0;
4213
4214 vec_info_shared shared;
4215
4216 machine_mode autodetected_vector_mode = VOIDmode;
4217 while (1)
4218 {
4219 bool vectorized = false;
4220 bool fatal = false;
4221 bb_vinfo = new _bb_vec_info (bbs, &shared);
4222
4223 bool first_time_p = shared.datarefs.is_empty ();
4224 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4225 if (first_time_p)
4226 bb_vinfo->shared->save_datarefs ();
4227 else
4228 bb_vinfo->shared->check_datarefs ();
4229 bb_vinfo->vector_mode = next_vector_mode;
4230
4231 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
4232 && dbg_cnt (vect_slp))
4233 {
4234 if (dump_enabled_p ())
4235 {
4236 dump_printf_loc (MSG_NOTE, vect_location,
4237 "***** Analysis succeeded with vector mode"
4238 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4239 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4240 }
4241
4242 bb_vinfo->shared->check_datarefs ();
4243
4244 unsigned i;
4245 slp_instance instance;
4246 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4247 {
4248 if (instance->subgraph_entries.is_empty ())
4249 continue;
4250
4251 vect_location = instance->location ();
4252 if (!unlimited_cost_model (NULL)
4253 && !vect_bb_vectorization_profitable_p
4254 (bb_vinfo, instance->subgraph_entries))
4255 {
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4258 "not vectorized: vectorization is not "
4259 "profitable.\n");
4260 continue;
4261 }
4262
4263 if (!vectorized && dump_enabled_p ())
4264 dump_printf_loc (MSG_NOTE, vect_location,
4265 "Basic block will be vectorized "
4266 "using SLP\n");
4267 vectorized = true;
4268
4269 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4270
4271 unsigned HOST_WIDE_INT bytes;
4272 if (dump_enabled_p ())
4273 {
4274 if (GET_MODE_SIZE
4275 (bb_vinfo->vector_mode).is_constant (&bytes))
4276 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4277 "basic block part vectorized using %wu "
4278 "byte vectors\n", bytes);
4279 else
4280 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4281 "basic block part vectorized using "
4282 "variable length vectors\n");
4283 }
4284 }
4285 }
4286 else
4287 {
4288 if (dump_enabled_p ())
4289 dump_printf_loc (MSG_NOTE, vect_location,
4290 "***** Analysis failed with vector mode %s\n",
4291 GET_MODE_NAME (bb_vinfo->vector_mode));
4292 }
4293
4294 if (mode_i == 0)
4295 autodetected_vector_mode = bb_vinfo->vector_mode;
4296
4297 if (!fatal)
4298 while (mode_i < vector_modes.length ()
4299 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4300 {
4301 if (dump_enabled_p ())
4302 dump_printf_loc (MSG_NOTE, vect_location,
4303 "***** The result for vector mode %s would"
4304 " be the same\n",
4305 GET_MODE_NAME (vector_modes[mode_i]));
4306 mode_i += 1;
4307 }
4308
4309 delete bb_vinfo;
4310
4311 if (mode_i < vector_modes.length ()
4312 && VECTOR_MODE_P (autodetected_vector_mode)
4313 && (related_vector_mode (vector_modes[mode_i],
4314 GET_MODE_INNER (autodetected_vector_mode))
4315 == autodetected_vector_mode)
4316 && (related_vector_mode (autodetected_vector_mode,
4317 GET_MODE_INNER (vector_modes[mode_i]))
4318 == vector_modes[mode_i]))
4319 {
4320 if (dump_enabled_p ())
4321 dump_printf_loc (MSG_NOTE, vect_location,
4322 "***** Skipping vector mode %s, which would"
4323 " repeat the analysis for %s\n",
4324 GET_MODE_NAME (vector_modes[mode_i]),
4325 GET_MODE_NAME (autodetected_vector_mode));
4326 mode_i += 1;
4327 }
4328
4329 if (vectorized
4330 || mode_i == vector_modes.length ()
4331 || autodetected_vector_mode == VOIDmode
4332 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4333 vector sizes will fail do not bother iterating. */
4334 || fatal)
4335 return vectorized;
4336
4337 /* Try the next biggest vector size. */
4338 next_vector_mode = vector_modes[mode_i++];
4339 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "***** Re-trying analysis with vector mode %s\n",
4342 GET_MODE_NAME (next_vector_mode));
4343 }
4344 }
4345
4346
4347 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4348 true if anything in the basic-block was vectorized. */
4349
4350 static bool
4351 vect_slp_bbs (vec<basic_block> bbs)
4352 {
4353 vec<data_reference_p> datarefs = vNULL;
4354 auto_vec<int> dataref_groups;
4355 int insns = 0;
4356 int current_group = 0;
4357
4358 for (unsigned i = 0; i < bbs.length (); i++)
4359 {
4360 basic_block bb = bbs[i];
4361 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4362 gsi_next (&gsi))
4363 {
4364 gimple *stmt = gsi_stmt (gsi);
4365 if (is_gimple_debug (stmt))
4366 continue;
4367
4368 insns++;
4369
4370 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4371 vect_location = stmt;
4372
4373 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4374 &dataref_groups, current_group))
4375 ++current_group;
4376 }
4377 }
4378
4379 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4380 }
4381
4382 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4383 true if anything in the basic-block was vectorized. */
4384
4385 bool
4386 vect_slp_bb (basic_block bb)
4387 {
4388 auto_vec<basic_block> bbs;
4389 bbs.safe_push (bb);
4390 return vect_slp_bbs (bbs);
4391 }
4392
4393 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4394 true if anything in the basic-block was vectorized. */
4395
4396 bool
4397 vect_slp_function (function *fun)
4398 {
4399 bool r = false;
4400 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4401 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4402
4403 /* For the moment split the function into pieces to avoid making
4404 the iteration on the vector mode moot. Split at points we know
4405 to not handle well, which are CFG merges (SLP discovery doesn't
4406 handle non-loop-header PHIs) and loop exits. Since pattern
4407 recog requires reverse iteration to visit uses before defs
4408 simply chop RPO into pieces. */
4409 auto_vec<basic_block> bbs;
4410 for (unsigned i = 0; i < n; i++)
4411 {
4412 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4413 bool split = false;
4414
4415 /* Split when a BB is not dominated by the first block. */
4416 if (!bbs.is_empty ()
4417 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4418 {
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4421 "splitting region at dominance boundary bb%d\n",
4422 bb->index);
4423 split = true;
4424 }
4425 /* Split when the loop determined by the first block
4426 is exited. This is because we eventually insert
4427 invariants at region begin. */
4428 else if (!bbs.is_empty ()
4429 && bbs[0]->loop_father != bb->loop_father
4430 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4431 {
4432 if (dump_enabled_p ())
4433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4434 "splitting region at loop %d exit at bb%d\n",
4435 bbs[0]->loop_father->num, bb->index);
4436 split = true;
4437 }
4438
4439 if (split && !bbs.is_empty ())
4440 {
4441 r |= vect_slp_bbs (bbs);
4442 bbs.truncate (0);
4443 bbs.quick_push (bb);
4444 }
4445 else
4446 bbs.safe_push (bb);
4447
4448 /* When we have a stmt ending this block and defining a
4449 value we have to insert on edges when inserting after it for
4450 a vector containing its definition. Avoid this for now. */
4451 if (gimple *last = last_stmt (bb))
4452 if (gimple_get_lhs (last)
4453 && is_ctrl_altering_stmt (last))
4454 {
4455 if (dump_enabled_p ())
4456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4457 "splitting region at control altering "
4458 "definition %G", last);
4459 r |= vect_slp_bbs (bbs);
4460 bbs.truncate (0);
4461 }
4462 }
4463
4464 if (!bbs.is_empty ())
4465 r |= vect_slp_bbs (bbs);
4466
4467 free (rpo);
4468
4469 return r;
4470 }
4471
4472 /* Build a variable-length vector in which the elements in ELTS are repeated
4473 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4474 RESULTS and add any new instructions to SEQ.
4475
4476 The approach we use is:
4477
4478 (1) Find a vector mode VM with integer elements of mode IM.
4479
4480 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4481 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4482 from small vectors to IM.
4483
4484 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4485
4486 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4487 correct byte contents.
4488
4489 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4490
4491 We try to find the largest IM for which this sequence works, in order
4492 to cut down on the number of interleaves. */
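/* A hypothetical illustration of the steps above (element values and
   counts chosen for exposition only): with ELTS = { a, b, c, d } and a
   variable-length VECTOR_TYPE, can_duplicate_and_interleave_p might pick
   NVECTORS = 2 so that step (2) yields ELTS' = { ab, cd }, where "ab"
   denotes the subvector { a, b } viewed as one integer of mode IM.
   Step (3) produces the vectors { ab, ab, ... } and { cd, cd, ... },
   one level of interleaving in step (4) gives { ab, cd, ab, cd, ... },
   and step (5) view-converts that to { a, b, c, d, a, b, c, d, ... }.  */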
4493
4494 void
4495 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
4496 vec<tree> elts, unsigned int nresults,
4497 vec<tree> &results)
4498 {
4499 unsigned int nelts = elts.length ();
4500 tree element_type = TREE_TYPE (vector_type);
4501
4502 /* (1) Find a vector mode VM with integer elements of mode IM. */
4503 unsigned int nvectors = 1;
4504 tree new_vector_type;
4505 tree permutes[2];
4506 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
4507 &nvectors, &new_vector_type,
4508 permutes))
4509 gcc_unreachable ();
4510
4511 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4512 unsigned int partial_nelts = nelts / nvectors;
4513 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
4514
4515 tree_vector_builder partial_elts;
4516 auto_vec<tree, 32> pieces (nvectors * 2);
4517 pieces.quick_grow (nvectors * 2);
4518 for (unsigned int i = 0; i < nvectors; ++i)
4519 {
4520 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4521 ELTS' has mode IM. */
4522 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
4523 for (unsigned int j = 0; j < partial_nelts; ++j)
4524 partial_elts.quick_push (elts[i * partial_nelts + j]);
4525 tree t = gimple_build_vector (seq, &partial_elts);
4526 t = gimple_build (seq, VIEW_CONVERT_EXPR,
4527 TREE_TYPE (new_vector_type), t);
4528
4529 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4530 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
4531 }
4532
4533 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4534 correct byte contents.
4535
4536 We need to repeat the following operation log2(nvectors) times:
4537
4538 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4539 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4540
4541 However, if each input repeats every N elements and the VF is
4542 a multiple of N * 2, the HI result is the same as the LO. */
4543 unsigned int in_start = 0;
4544 unsigned int out_start = nvectors;
4545 unsigned int hi_start = nvectors / 2;
4546 /* A bound on the number of outputs needed to produce NRESULTS results
4547 in the final iteration. */
4548 unsigned int noutputs_bound = nvectors * nresults;
4549 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
4550 {
4551 noutputs_bound /= 2;
4552 unsigned int limit = MIN (noutputs_bound, nvectors);
4553 for (unsigned int i = 0; i < limit; ++i)
4554 {
4555 if ((i & 1) != 0
4556 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
4557 2 * in_repeat))
4558 {
4559 pieces[out_start + i] = pieces[out_start + i - 1];
4560 continue;
4561 }
4562
4563 tree output = make_ssa_name (new_vector_type);
4564 tree input1 = pieces[in_start + (i / 2)];
4565 tree input2 = pieces[in_start + (i / 2) + hi_start];
4566 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
4567 input1, input2,
4568 permutes[i & 1]);
4569 gimple_seq_add_stmt (seq, stmt);
4570 pieces[out_start + i] = output;
4571 }
4572 std::swap (in_start, out_start);
4573 }
4574
4575 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4576 results.reserve (nresults);
4577 for (unsigned int i = 0; i < nresults; ++i)
4578 if (i < nvectors)
4579 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
4580 pieces[in_start + i]));
4581 else
4582 results.quick_push (results[i - nvectors]);
4583 }
4584
4585
4586 /* For constant and loop invariant defs in OP_NODE this function creates
4587 vector defs that will be used in the vectorized stmts and stores them
4588 to SLP_TREE_VEC_DEFS of OP_NODE. */
4589
4590 static void
4591 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
4592 {
4593 unsigned HOST_WIDE_INT nunits;
4594 tree vec_cst;
4595 unsigned j, number_of_places_left_in_vector;
4596 tree vector_type;
4597 tree vop;
4598 int group_size = op_node->ops.length ();
4599 unsigned int vec_num, i;
4600 unsigned number_of_copies = 1;
4601 bool constant_p;
4602 gimple_seq ctor_seq = NULL;
4603 auto_vec<tree, 16> permute_results;
4604
4605 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4606 vector_type = SLP_TREE_VECTYPE (op_node);
4607
4608 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
4609 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
4610 auto_vec<tree> voprnds (number_of_vectors);
4611
4612 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4613 created vectors. It is greater than 1 if unrolling is performed.
4614
4615 For example, we have two scalar operands, s1 and s2 (e.g., group of
4616 strided accesses of size two), while NUNITS is four (i.e., four scalars
4617 of this type can be packed in a vector). The output vector will contain
4618 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4619 will be 2).
4620
4621 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4622 containing the operands.
4623
4624 For example, NUNITS is four as before, and the group size is 8
4625 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4626 {s5, s6, s7, s8}. */
4627
4628 /* When using duplicate_and_interleave, we just need one element for
4629 each scalar statement. */
4630 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4631 nunits = group_size;
4632
4633 number_of_copies = nunits * number_of_vectors / group_size;
4634
4635 number_of_places_left_in_vector = nunits;
4636 constant_p = true;
4637 tree_vector_builder elts (vector_type, nunits, 1);
4638 elts.quick_grow (nunits);
4639 stmt_vec_info insert_after = NULL;
4640 for (j = 0; j < number_of_copies; j++)
4641 {
4642 tree op;
4643 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
4644 {
4645 /* Create 'vect_ = {op0,op1,...,opn}'. */
4646 number_of_places_left_in_vector--;
4647 tree orig_op = op;
4648 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
4649 {
4650 if (CONSTANT_CLASS_P (op))
4651 {
4652 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4653 {
4654 /* Can't use VIEW_CONVERT_EXPR for booleans because
4655 of possibly different sizes of scalar value and
4656 vector element. */
4657 if (integer_zerop (op))
4658 op = build_int_cst (TREE_TYPE (vector_type), 0);
4659 else if (integer_onep (op))
4660 op = build_all_ones_cst (TREE_TYPE (vector_type));
4661 else
4662 gcc_unreachable ();
4663 }
4664 else
4665 op = fold_unary (VIEW_CONVERT_EXPR,
4666 TREE_TYPE (vector_type), op);
4667 gcc_assert (op && CONSTANT_CLASS_P (op));
4668 }
4669 else
4670 {
4671 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
4672 gimple *init_stmt;
4673 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4674 {
4675 tree true_val
4676 = build_all_ones_cst (TREE_TYPE (vector_type));
4677 tree false_val
4678 = build_zero_cst (TREE_TYPE (vector_type));
4679 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
4680 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
4681 op, true_val,
4682 false_val);
4683 }
4684 else
4685 {
4686 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4687 op);
4688 init_stmt
4689 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4690 op);
4691 }
4692 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4693 op = new_temp;
4694 }
4695 }
4696 elts[number_of_places_left_in_vector] = op;
4697 if (!CONSTANT_CLASS_P (op))
4698 constant_p = false;
4699 /* For BB vectorization we have to compute an insert location
4700 when a def is inside the analyzed region since we cannot
4701 simply insert at the BB start in this case. */
4702 stmt_vec_info opdef;
4703 if (TREE_CODE (orig_op) == SSA_NAME
4704 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4705 && is_a <bb_vec_info> (vinfo)
4706 && (opdef = vinfo->lookup_def (orig_op)))
4707 {
4708 if (!insert_after)
4709 insert_after = opdef;
4710 else
4711 insert_after = get_later_stmt (insert_after, opdef);
4712 }
4713
4714 if (number_of_places_left_in_vector == 0)
4715 {
4716 if (constant_p
4717 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4718 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4719 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4720 else
4721 {
4722 if (permute_results.is_empty ())
4723 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4724 elts, number_of_vectors,
4725 permute_results);
4726 vec_cst = permute_results[number_of_vectors - j - 1];
4727 }
4728 if (!gimple_seq_empty_p (ctor_seq))
4729 {
4730 if (insert_after)
4731 {
4732 gimple_stmt_iterator gsi;
4733 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
4734 {
4735 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
4736 gsi_insert_seq_before (&gsi, ctor_seq,
4737 GSI_CONTINUE_LINKING);
4738 }
4739 else if (!stmt_ends_bb_p (insert_after->stmt))
4740 {
4741 gsi = gsi_for_stmt (insert_after->stmt);
4742 gsi_insert_seq_after (&gsi, ctor_seq,
4743 GSI_CONTINUE_LINKING);
4744 }
4745 else
4746 {
4747 /* When we want to insert after a def where the
4748 defining stmt throws then insert on the fallthru
4749 edge. */
4750 edge e = find_fallthru_edge
4751 (gimple_bb (insert_after->stmt)->succs);
4752 basic_block new_bb
4753 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
4754 gcc_assert (!new_bb);
4755 }
4756 }
4757 else
4758 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4759 ctor_seq = NULL;
4760 }
4761 voprnds.quick_push (vec_cst);
4762 insert_after = NULL;
4763 number_of_places_left_in_vector = nunits;
4764 constant_p = true;
4765 elts.new_vector (vector_type, nunits, 1);
4766 elts.quick_grow (nunits);
4767 }
4768 }
4769 }
4770
4771 /* Since the vectors are created in the reverse order, we should invert
4772 them. */
4773 vec_num = voprnds.length ();
4774 for (j = vec_num; j != 0; j--)
4775 {
4776 vop = voprnds[j - 1];
4777 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4778 }
4779
4780 /* In case VF is greater than the unrolling factor needed for the SLP
4781 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4782 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4783 to replicate the vectors. */
4784 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4785 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4786 i++)
4787 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4788 }
4789
4790 /* Get the Ith vectorized definition from SLP_NODE. */
4791
4792 tree
4793 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4794 {
4795 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4796 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4797 else
4798 return SLP_TREE_VEC_DEFS (slp_node)[i];
4799 }
4800
4801 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4802
4803 void
4804 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4805 {
4806 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4807 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4808 {
4809 unsigned j;
4810 gimple *vec_def_stmt;
4811 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4812 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4813 }
4814 else
4815 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4816 }
4817
4818 /* Get N vectorized definitions for SLP_NODE. */
4819
4820 void
4821 vect_get_slp_defs (vec_info *,
4822 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4823 {
4824 if (n == -1U)
4825 n = SLP_TREE_CHILDREN (slp_node).length ();
4826
4827 for (unsigned i = 0; i < n; ++i)
4828 {
4829 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4830 vec<tree> vec_defs = vNULL;
4831 vect_get_slp_defs (child, &vec_defs);
4832 vec_oprnds->quick_push (vec_defs);
4833 }
4834 }
4835
4836 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4837 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4838 permute statements for the SLP node NODE. Store the number of vector
4839 permute instructions in *N_PERMS and the number of vector load
4840 instructions in *N_LOADS. */
4841
4842 bool
4843 vect_transform_slp_perm_load (vec_info *vinfo,
4844 slp_tree node, vec<tree> dr_chain,
4845 gimple_stmt_iterator *gsi, poly_uint64 vf,
4846 bool analyze_only, unsigned *n_perms,
4847 unsigned int *n_loads)
4848 {
4849 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4850 int vec_index = 0;
4851 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4852 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4853 unsigned int mask_element;
4854 machine_mode mode;
4855
4856 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4857 return false;
4858
4859 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4860
4861 mode = TYPE_MODE (vectype);
4862 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4863
4864 /* Initialize the vect stmts of NODE to properly insert the generated
4865 stmts later. */
4866 if (! analyze_only)
4867 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4868 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4869 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4870
4871 /* Generate permutation masks for every NODE. Number of masks for each NODE
4872 is equal to GROUP_SIZE.
4873 E.g., we have a group of three nodes with three loads from the same
4874 location in each node, and the vector size is 4. I.e., we have an
4875 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4876 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4877 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4878 ...
4879
4880 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4881 The last mask is illegal since we assume two operands for permute
4882 operation, and the mask element values can't be outside that range.
4883 Hence, the last mask must be converted into {2,5,5,5}.
4884 For the first two permutations we need the first and the second input
4885 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4886 we need the second and the third vectors: {b1,c1,a2,b2} and
4887 {c2,a3,b3,c3}. */
4888
4889 int vect_stmts_counter = 0;
4890 unsigned int index = 0;
4891 int first_vec_index = -1;
4892 int second_vec_index = -1;
4893 bool noop_p = true;
4894 *n_perms = 0;
4895
4896 vec_perm_builder mask;
4897 unsigned int nelts_to_build;
4898 unsigned int nvectors_per_build;
4899 unsigned int in_nlanes;
4900 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4901 && multiple_p (nunits, group_size));
4902 if (repeating_p)
4903 {
4904 /* A single vector contains a whole number of copies of the node, so:
4905 (a) all permutes can use the same mask; and
4906 (b) the permutes only need a single vector input. */
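      /* Hypothetical example: with a group size of 2 and a load
	 permutation of { 1, 0 }, the encoded mask built below is
	 { 1, 0, 3, 2, 5, 4 } (two patterns with three elements each),
	 i.e. "swap adjacent pairs", which is valid for any vector
	 length that is a multiple of 2.  */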
4907 mask.new_vector (nunits, group_size, 3);
4908 nelts_to_build = mask.encoded_nelts ();
4909 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4910 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
4911 }
4912 else
4913 {
4914 /* We need to construct a separate mask for each vector statement. */
4915 unsigned HOST_WIDE_INT const_nunits, const_vf;
4916 if (!nunits.is_constant (&const_nunits)
4917 || !vf.is_constant (&const_vf))
4918 return false;
4919 mask.new_vector (const_nunits, const_nunits, 1);
4920 nelts_to_build = const_vf * group_size;
4921 nvectors_per_build = 1;
4922 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
4923 }
4924 auto_sbitmap used_in_lanes (in_nlanes);
4925 bitmap_clear (used_in_lanes);
4926
4927 unsigned int count = mask.encoded_nelts ();
4928 mask.quick_grow (count);
4929 vec_perm_indices indices;
4930
4931 for (unsigned int j = 0; j < nelts_to_build; j++)
4932 {
4933 unsigned int iter_num = j / group_size;
4934 unsigned int stmt_num = j % group_size;
4935 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4936 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4937 bitmap_set_bit (used_in_lanes, i);
4938 if (repeating_p)
4939 {
4940 first_vec_index = 0;
4941 mask_element = i;
4942 }
4943 else
4944 {
4945 /* Enforced before the loop when !repeating_p. */
4946 unsigned int const_nunits = nunits.to_constant ();
4947 vec_index = i / const_nunits;
4948 mask_element = i % const_nunits;
4949 if (vec_index == first_vec_index
4950 || first_vec_index == -1)
4951 {
4952 first_vec_index = vec_index;
4953 }
4954 else if (vec_index == second_vec_index
4955 || second_vec_index == -1)
4956 {
4957 second_vec_index = vec_index;
4958 mask_element += const_nunits;
4959 }
4960 else
4961 {
4962 if (dump_enabled_p ())
4963 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4964 "permutation requires at "
4965 "least three vectors %G",
4966 stmt_info->stmt);
4967 gcc_assert (analyze_only);
4968 return false;
4969 }
4970
4971 gcc_assert (mask_element < 2 * const_nunits);
4972 }
4973
4974 if (mask_element != index)
4975 noop_p = false;
4976 mask[index++] = mask_element;
4977
4978 if (index == count && !noop_p)
4979 {
4980 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4981 if (!can_vec_perm_const_p (mode, indices))
4982 {
4983 if (dump_enabled_p ())
4984 {
4985 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4986 vect_location,
4987 "unsupported vect permute { ");
4988 for (i = 0; i < count; ++i)
4989 {
4990 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4991 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4992 }
4993 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4994 }
4995 gcc_assert (analyze_only);
4996 return false;
4997 }
4998
4999 ++*n_perms;
5000 }
5001
5002 if (index == count)
5003 {
5004 if (!analyze_only)
5005 {
5006 tree mask_vec = NULL_TREE;
5007
5008 if (! noop_p)
5009 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5010
5011 if (second_vec_index == -1)
5012 second_vec_index = first_vec_index;
5013
5014 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5015 {
5016 /* Generate the permute statement if necessary. */
5017 tree first_vec = dr_chain[first_vec_index + ri];
5018 tree second_vec = dr_chain[second_vec_index + ri];
5019 gimple *perm_stmt;
5020 if (! noop_p)
5021 {
5022 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5023 tree perm_dest
5024 = vect_create_destination_var (gimple_assign_lhs (stmt),
5025 vectype);
5026 perm_dest = make_ssa_name (perm_dest);
5027 perm_stmt
5028 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5029 first_vec, second_vec,
5030 mask_vec);
5031 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5032 gsi);
5033 }
5034 else
5035 /* If mask was NULL_TREE generate the requested
5036 identity transform. */
5037 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5038
5039 /* Store the vector statement in NODE. */
5040 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5041 }
5042 }
5043
5044 index = 0;
5045 first_vec_index = -1;
5046 second_vec_index = -1;
5047 noop_p = true;
5048 }
5049 }
5050
5051 if (n_loads)
5052 {
5053 if (repeating_p)
5054 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5055 else
5056 {
5057 /* Enforced above when !repeating_p. */
5058 unsigned int const_nunits = nunits.to_constant ();
5059 *n_loads = 0;
5060 bool load_seen = false;
5061 for (unsigned i = 0; i < in_nlanes; ++i)
5062 {
5063 if (i % const_nunits == 0)
5064 {
5065 if (load_seen)
5066 *n_loads += 1;
5067 load_seen = false;
5068 }
5069 if (bitmap_bit_p (used_in_lanes, i))
5070 load_seen = true;
5071 }
5072 if (load_seen)
5073 *n_loads += 1;
5074 }
5075 }
5076
5077 return true;
5078 }
5079
5080
5081 /* Vectorize the SLP permutations in NODE as specified
5082 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5083 child number and lane number.
5084 Interleaving of two two-lane two-child SLP subtrees (not supported):
5085 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5086 A blend of two four-lane two-child SLP subtrees:
5087 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5088 Highpart of a four-lane one-child SLP subtree (not supported):
5089 [ { 0, 2 }, { 0, 3 } ]
5090 Currently only a subset is supported by the code generation below. */
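/* For instance, the blend above of two four-lane two-child subtrees maps,
   for a four-lane vector type in a basic-block context (VF 1), to a single
   two-input VEC_PERM_EXPR with selector { 0, 5, 2, 7 } (lanes of the second
   input are referenced with an offset of nunits).  This is an illustration
   only; the exact selector depends on how the children's defs are laid
   out.  */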
5091
5092 static bool
5093 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5094 slp_tree node, stmt_vector_for_cost *cost_vec)
5095 {
5096 tree vectype = SLP_TREE_VECTYPE (node);
5097
5098 /* ??? We currently only support all same vector input and output types
5099 while the SLP IL should really do a concat + select and thus accept
5100 arbitrary mismatches. */
5101 slp_tree child;
5102 unsigned i;
5103 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5104 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5105 {
5106 if (dump_enabled_p ())
5107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5108 "Unsupported lane permutation\n");
5109 return false;
5110 }
5111
5112 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5113 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5114 if (dump_enabled_p ())
5115 {
5116 dump_printf_loc (MSG_NOTE, vect_location,
5117 "vectorizing permutation");
5118 for (unsigned i = 0; i < perm.length (); ++i)
5119 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5120 dump_printf (MSG_NOTE, "\n");
5121 }
5122
5123 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5124 if (!nunits.is_constant ())
5125 return false;
5126 unsigned HOST_WIDE_INT vf = 1;
5127 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5128 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5129 return false;
5130 unsigned olanes = vf * SLP_TREE_LANES (node);
5131 gcc_assert (multiple_p (olanes, nunits));
5132
5133 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5134 from the { SLP operand, scalar lane } permutation as recorded in the
5135 SLP node as an intermediate step. This part should already work
5136 with SLP children with an arbitrary number of lanes. */
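  /* A hypothetical decomposition: if the child for operand 1 has a vector
     type with four subparts, the { SLP operand, scalar lane } pair { 1, 6 }
     becomes { { 1, 1 }, 2 }, i.e. lane 2 of that child's second vector
     def.  */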
5137 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5138 auto_vec<unsigned> active_lane;
5139 vperm.create (olanes);
5140 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5141 for (unsigned i = 0; i < vf; ++i)
5142 {
5143 for (unsigned pi = 0; pi < perm.length (); ++pi)
5144 {
5145 std::pair<unsigned, unsigned> p = perm[pi];
5146 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5147 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5148 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5149 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5150 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5151 }
5152 /* Advance to the next group. */
5153 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5154 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5155 }
5156
5157 if (dump_enabled_p ())
5158 {
5159 dump_printf_loc (MSG_NOTE, vect_location, "as");
5160 for (unsigned i = 0; i < vperm.length (); ++i)
5161 {
5162 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5163 dump_printf (MSG_NOTE, ",");
5164 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5165 vperm[i].first.first, vperm[i].first.second,
5166 vperm[i].second);
5167 }
5168 dump_printf (MSG_NOTE, "\n");
5169 }
5170
5171 /* We can only handle two-vector permutes, everything else should
5172 be lowered on the SLP level. The following is closely inspired
5173 by vect_transform_slp_perm_load and is supposed to eventually
5174 replace it.
5175 ??? As intermediate step do code-gen in the SLP tree representation
5176 somehow? */
5177 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5178 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5179 unsigned int const_nunits = nunits.to_constant ();
5180 unsigned int index = 0;
5181 unsigned int mask_element;
5182 vec_perm_builder mask;
5183 mask.new_vector (const_nunits, const_nunits, 1);
5184 unsigned int count = mask.encoded_nelts ();
5185 mask.quick_grow (count);
5186 vec_perm_indices indices;
5187 unsigned nperms = 0;
5188 for (unsigned i = 0; i < vperm.length (); ++i)
5189 {
5190 mask_element = vperm[i].second;
5191 if (first_vec.first == -1U
5192 || first_vec == vperm[i].first)
5193 first_vec = vperm[i].first;
5194 else if (second_vec.first == -1U
5195 || second_vec == vperm[i].first)
5196 {
5197 second_vec = vperm[i].first;
5198 mask_element += const_nunits;
5199 }
5200 else
5201 {
5202 if (dump_enabled_p ())
5203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5204 "permutation requires at "
5205 			     "least three vectors\n");
5206 gcc_assert (!gsi);
5207 return false;
5208 }
5209
5210 mask[index++] = mask_element;
5211
5212 if (index == count)
5213 {
5214 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5215 const_nunits);
5216 bool identity_p = indices.series_p (0, 1, 0, 1);
5217 if (!identity_p
5218 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5219 {
5220 if (dump_enabled_p ())
5221 {
5222 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5223 vect_location,
5224 "unsupported vect permute { ");
5225 for (i = 0; i < count; ++i)
5226 {
5227 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5228 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5229 }
5230 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5231 }
5232 gcc_assert (!gsi);
5233 return false;
5234 }
5235
5236 if (!identity_p)
5237 nperms++;
5238 if (gsi)
5239 {
5240 if (second_vec.first == -1U)
5241 second_vec = first_vec;
5242
5243 /* Generate the permute statement if necessary. */
5244 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5245 tree first_def
5246 = vect_get_slp_vect_def (first_node, first_vec.second);
5247 gassign *perm_stmt;
5248 tree perm_dest = make_ssa_name (vectype);
5249 if (!identity_p)
5250 {
5251 slp_tree second_node
5252 = SLP_TREE_CHILDREN (node)[second_vec.first];
5253 tree second_def
5254 = vect_get_slp_vect_def (second_node, second_vec.second);
5255 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5256 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5257 first_def, second_def,
5258 mask_vec);
5259 }
5260 else
5261 /* We need a copy here in case the def was external. */
5262 perm_stmt = gimple_build_assign (perm_dest, first_def);
5263 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5264 /* Store the vector statement in NODE. */
5265 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5266 }
5267
5268 index = 0;
5269 first_vec = std::make_pair (-1U, -1U);
5270 second_vec = std::make_pair (-1U, -1U);
5271 }
5272 }
5273
5274 if (!gsi)
5275 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5276
5277 return true;
5278 }
5279
5280 /* Vectorize SLP NODE. */
5281
5282 static void
5283 vect_schedule_slp_node (vec_info *vinfo,
5284 slp_tree node, slp_instance instance)
5285 {
5286 gimple_stmt_iterator si;
5287 int i;
5288 slp_tree child;
5289
5290 /* For existing vectors there's nothing to do. */
5291 if (SLP_TREE_VEC_DEFS (node).exists ())
5292 return;
5293
5294 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5295
5296 /* Vectorize externals and constants. */
5297 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5298 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5299 {
5300 /* ??? vectorizable_shift can end up using a scalar operand which is
5301 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5302 node in this case. */
5303 if (!SLP_TREE_VECTYPE (node))
5304 return;
5305
5306 vect_create_constant_vectors (vinfo, node);
5307 return;
5308 }
5309
5310 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5311
5312 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5313 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5314
5315 if (dump_enabled_p ())
5316 dump_printf_loc (MSG_NOTE, vect_location,
5317 "------>vectorizing SLP node starting from: %G",
5318 stmt_info->stmt);
5319
5320 if (STMT_VINFO_DATA_REF (stmt_info)
5321 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5322 {
5323       /* Vectorized loads go before the first scalar load so the result is
5324 	 ready early; vectorized stores go before the last scalar stmt,
5325 	 which is where all uses are ready.  */
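      /* For example, given the two-lane store group
	   a[0] = x_1;
	   a[1] = y_2;
	 the vector store would be emitted immediately before a[1] = y_2,
	 the first point where both x_1 and y_2 are available.  */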
5326 stmt_vec_info last_stmt_info = NULL;
5327 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5328 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5329 else /* DR_IS_WRITE */
5330 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5331 si = gsi_for_stmt (last_stmt_info->stmt);
5332 }
5333 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5334 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5335 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5336 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5337 {
5338 /* For PHI node vectorization we do not use the insertion iterator. */
5339 si = gsi_none ();
5340 }
5341 else
5342 {
5343       /* Emit other stmts after the children's vectorized defs, which is
5344 	 the earliest possible insertion point.  */
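      /* That is, find the latest (in the dominance sense) def among all
	 children -- vector stmts for internal children, scalar defs for
	 un-vectorized externals and vector defs for pre-existing vectors --
	 and insert right after it.  */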
5345 gimple *last_stmt = NULL;
5346 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5347 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5348 {
5349 	    /* For fold-left reductions we are retaining the scalar
5350 	       reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
5351 	       set, so the representation isn't perfect.  Resort to the
5352 	       last scalar def here.  */
5353 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5354 {
5355 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5356 == cycle_phi_info_type);
5357 gphi *phi = as_a <gphi *>
5358 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5359 if (!last_stmt
5360 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5361 last_stmt = phi;
5362 }
5363 	    /* We emit all vectorized stmts for a node at the same place,
5364 	       so the last one pushed is also the last one in the IL.
5365 	       ???  Unless a load permutation is applied and that ends up
5366 	       re-using an earlier generated load.  */
5367 unsigned j;
5368 gimple *vstmt;
5369 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5370 if (!last_stmt
5371 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5372 last_stmt = vstmt;
5373 }
5374 else if (!SLP_TREE_VECTYPE (child))
5375 {
5376 	    /* Externals without a vector type stay scalar; look at all scalar defs.  */
5377 unsigned j;
5378 tree def;
5379 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5380 if (TREE_CODE (def) == SSA_NAME
5381 && !SSA_NAME_IS_DEFAULT_DEF (def))
5382 {
5383 gimple *stmt = SSA_NAME_DEF_STMT (def);
5384 if (!last_stmt
5385 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5386 last_stmt = stmt;
5387 }
5388 }
5389 else
5390 {
5391 /* For externals we have to look at all defs since their
5392 insertion place is decided per vector. But beware
5393 of pre-existing vectors where we need to make sure
5394 we do not insert before the region boundary. */
5395 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5396 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5397 last_stmt = gsi_stmt (gsi_after_labels
5398 (as_a <bb_vec_info> (vinfo)->bbs[0]));
5399 else
5400 {
5401 unsigned j;
5402 tree vdef;
5403 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5404 if (TREE_CODE (vdef) == SSA_NAME
5405 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5406 {
5407 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5408 if (!last_stmt
5409 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5410 last_stmt = vstmt;
5411 }
5412 }
5413 }
5414 /* This can happen when all children are pre-existing vectors or
5415 constants. */
5416 if (!last_stmt)
5417 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5418 if (is_a <gphi *> (last_stmt))
5419 si = gsi_after_labels (gimple_bb (last_stmt));
5420 else
5421 {
5422 si = gsi_for_stmt (last_stmt);
5423 gsi_next (&si);
5424 }
5425 }
5426
5427 bool done_p = false;
5428
5429 /* Handle purely internal nodes. */
5430 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5431 {
5432       /* ???  The transform kind is stored in STMT_VINFO_TYPE, which might
5433 	 be shared between different SLP nodes (but usually it's the same
5434 	 operation, apart from the case where the stmt is only there to
5435 	 denote the actual scalar lane defs ...).  So do not call
5436 	 vect_transform_stmt but open-code it here (partly).  */
5437 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
5438 gcc_assert (done);
5439 done_p = true;
5440 }
5441 if (!done_p)
5442 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
5443 }
5444
5445 /* Replace the scalar calls in SLP node NODE with assignments of zero to their lhs.
5446 For loop vectorization this is done in vectorizable_call, but for SLP
5447 it needs to be deferred until end of vect_schedule_slp, because multiple
5448 SLP instances may refer to the same scalar stmt. */
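/* For example (illustrative only), a scalar lane stmt such as
     x_1 = some_call (a_2);
   is replaced by
     x_1 = 0;
   with removal of the then-dead definition left to later DCE.  */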
5449
5450 static void
5451 vect_remove_slp_scalar_calls (vec_info *vinfo,
5452 slp_tree node, hash_set<slp_tree> &visited)
5453 {
5454 gimple *new_stmt;
5455 gimple_stmt_iterator gsi;
5456 int i;
5457 slp_tree child;
5458 tree lhs;
5459 stmt_vec_info stmt_info;
5460
5461 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5462 return;
5463
5464 if (visited.add (node))
5465 return;
5466
5467 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5468 vect_remove_slp_scalar_calls (vinfo, child, visited);
5469
5470 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5471 {
5472 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
5473 if (!stmt || gimple_bb (stmt) == NULL)
5474 continue;
5475 if (is_pattern_stmt_p (stmt_info)
5476 || !PURE_SLP_STMT (stmt_info))
5477 continue;
5478 lhs = gimple_call_lhs (stmt);
5479 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
5480 gsi = gsi_for_stmt (stmt);
5481 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
5482 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
5483 }
5484 }
5485
5486 static void
5487 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
5488 {
5489 hash_set<slp_tree> visited;
5490 vect_remove_slp_scalar_calls (vinfo, node, visited);
5491 }
5492
5493 /* Vectorize the instance root. */
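/* For a BB SLP instance rooted at a CONSTRUCTOR assignment, e.g.
     x_1 = { a_2, b_3, c_4, d_5 };
   this replaces the root stmt either with an assignment from the single
   vectorized def (possibly wrapped in a VIEW_CONVERT_EXPR), or, when more
   than one vector stmt was generated, with a CONSTRUCTOR of their defs.  */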
5494
5495 void
5496 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
5497 {
5498 gassign *rstmt = NULL;
5499
5500 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
5501 {
5502 gimple *child_stmt;
5503 int j;
5504
5505 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5506 {
5507 tree vect_lhs = gimple_get_lhs (child_stmt);
5508 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
5509 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
5510 TREE_TYPE (vect_lhs)))
5511 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
5512 vect_lhs);
5513 rstmt = gimple_build_assign (root_lhs, vect_lhs);
5514 break;
5515 }
5516 }
5517 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
5518 {
5519 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5520 gimple *child_stmt;
5521 int j;
5522 vec<constructor_elt, va_gc> *v;
5523 vec_alloc (v, nelts);
5524
5525 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
5526 {
5527 CONSTRUCTOR_APPEND_ELT (v,
5528 NULL_TREE,
5529 gimple_get_lhs (child_stmt));
5530 }
5531 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
5532 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
5533 tree r_constructor = build_constructor (rtype, v);
5534 rstmt = gimple_build_assign (lhs, r_constructor);
5535 }
5536
5537 gcc_assert (rstmt);
5538
5539 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
5540 gsi_replace (&rgsi, rstmt, true);
5541 }
5542
5543 struct slp_scc_info
5544 {
5545 bool on_stack;
5546 int dfs;
5547 int lowlink;
5548 };
5549
5550 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
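/* This follows Tarjan's SCC algorithm: each node gets a DFS index and a
   lowlink, nodes stay on STACK while their SCC is still open, and once a
   root (dfs == lowlink) is reached the whole SCC is popped and scheduled.
   Cycles in the SLP graph go through vectorized PHIs (reduction and
   induction cycles), which is why PHIs are used to break them below.  */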
5551
5552 static void
5553 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
5554 hash_map<slp_tree, slp_scc_info> &scc_info,
5555 int &maxdfs, vec<slp_tree> &stack)
5556 {
5557 bool existed_p;
5558 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
5559 gcc_assert (!existed_p);
5560 info->dfs = maxdfs;
5561 info->lowlink = maxdfs;
5562 maxdfs++;
5563
5564 /* Leaf. */
5565 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
5566 {
5567 info->on_stack = false;
5568 vect_schedule_slp_node (vinfo, node, instance);
5569 return;
5570 }
5571
5572 info->on_stack = true;
5573 stack.safe_push (node);
5574
5575 unsigned i;
5576 slp_tree child;
5577 /* DFS recurse. */
5578 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5579 {
5580 if (!child)
5581 continue;
5582 slp_scc_info *child_info = scc_info.get (child);
5583 if (!child_info)
5584 {
5585 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
5586 	    /* Recursion might have resized SCC_INFO, invalidating INFO.  */
5587 info = scc_info.get (node);
5588 child_info = scc_info.get (child);
5589 info->lowlink = MIN (info->lowlink, child_info->lowlink);
5590 }
5591 else if (child_info->on_stack)
5592 info->lowlink = MIN (info->lowlink, child_info->dfs);
5593 }
5594 if (info->lowlink != info->dfs)
5595 return;
5596
5597 auto_vec<slp_tree, 4> phis_to_fixup;
5598
5599 /* Singleton. */
5600 if (stack.last () == node)
5601 {
5602 stack.pop ();
5603 info->on_stack = false;
5604 vect_schedule_slp_node (vinfo, node, instance);
5605 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
5606 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
5607 phis_to_fixup.quick_push (node);
5608 }
5609 else
5610 {
5611 /* SCC. */
5612 int last_idx = stack.length () - 1;
5613 while (stack[last_idx] != node)
5614 last_idx--;
5615       /* We can break the cycle at PHIs that have at least one child
5616 	 already code generated.  Then we could re-start the DFS walk until
5617 	 all nodes in the SCC are covered (we might have new entries
5618 	 for only back-reachable nodes).  But it's simpler to just
5619 	 iterate and schedule those that are ready.  */
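      /* E.g. for a simple reduction cycle PHI <-> plus, the PHI is ready
	 immediately (its init-value child is a leaf already scheduled during
	 the DFS, only the backedge child is still on the stack), the plus
	 becomes ready once the PHI has been scheduled, and the PHI's
	 backedge argument is filled in by the fixup loop at the end.  */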
5620 unsigned todo = stack.length () - last_idx;
5621 do
5622 {
5623 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
5624 {
5625 slp_tree entry = stack[idx];
5626 if (!entry)
5627 continue;
5628 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
5629 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
5630 bool ready = !phi;
5631 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
5632 if (!child)
5633 {
5634 gcc_assert (phi);
5635 ready = true;
5636 break;
5637 }
5638 else if (scc_info.get (child)->on_stack)
5639 {
5640 if (!phi)
5641 {
5642 ready = false;
5643 break;
5644 }
5645 }
5646 else
5647 {
5648 if (phi)
5649 {
5650 ready = true;
5651 break;
5652 }
5653 }
5654 if (ready)
5655 {
5656 vect_schedule_slp_node (vinfo, entry, instance);
5657 scc_info.get (entry)->on_stack = false;
5658 stack[idx] = NULL;
5659 todo--;
5660 if (phi)
5661 phis_to_fixup.safe_push (entry);
5662 }
5663 }
5664 }
5665 while (todo != 0);
5666
5667 /* Pop the SCC. */
5668 stack.truncate (last_idx);
5669 }
5670
5671 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
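  /* Each vectorized PHI copy I receives, on edge E, the I-th vector def of
     the child node sitting in the corresponding PHI argument slot; only
     internal-def children need this.  */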
5672 slp_tree phi_node;
5673 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
5674 {
5675 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
5676 edge_iterator ei;
5677 edge e;
5678 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
5679 {
5680 unsigned dest_idx = e->dest_idx;
5681 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
5682 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
5683 continue;
5684 /* Simply fill all args. */
5685 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
5686 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
5687 vect_get_slp_vect_def (child, i),
5688 e, gimple_phi_arg_location (phi, dest_idx));
5689 }
5690 }
5691 }
5692
5693 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5694
5695 void
5696 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
5697 {
5698 slp_instance instance;
5699 unsigned int i;
5700
5701 hash_map<slp_tree, slp_scc_info> scc_info;
5702 int maxdfs = 0;
5703 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5704 {
5705 slp_tree node = SLP_INSTANCE_TREE (instance);
5706 if (dump_enabled_p ())
5707 {
5708 dump_printf_loc (MSG_NOTE, vect_location,
5709 "Vectorizing SLP tree:\n");
5710 if (SLP_INSTANCE_ROOT_STMT (instance))
5711 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
5712 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
5713 vect_print_slp_graph (MSG_NOTE, vect_location,
5714 SLP_INSTANCE_TREE (instance));
5715 }
5716 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
5717 have a PHI be the node breaking the cycle. */
5718 auto_vec<slp_tree> stack;
5719 if (!scc_info.get (node))
5720 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
5721
5722 if (SLP_INSTANCE_ROOT_STMT (instance))
5723 vectorize_slp_instance_root_stmt (node, instance);
5724
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_NOTE, vect_location,
5727 "vectorizing stmts using SLP.\n");
5728 }
5729
5730 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5731 {
5732 slp_tree root = SLP_INSTANCE_TREE (instance);
5733 stmt_vec_info store_info;
5734 unsigned int j;
5735
5736 	/* Remove scalar call stmts.  Do not do this for basic-block
5737 	   vectorization as not all uses may be vectorized.
5738 	   ???  Why should this be necessary?  DCE should be able to
5739 	   remove the stmts on its own.
5740 	   ???  For BB vectorization we could just as well remove scalar
5741 	   stmts starting from the SLP tree root if they have no
5742 	   uses.  */
5743 if (is_a <loop_vec_info> (vinfo))
5744 vect_remove_slp_scalar_calls (vinfo, root);
5745
5746       /* Remove the vectorized stores' original scalar stmts.  */
5747 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
5748 {
5749 if (!STMT_VINFO_DATA_REF (store_info)
5750 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
5751 break;
5752
5753 store_info = vect_orig_stmt (store_info);
5754 /* Free the attached stmt_vec_info and remove the stmt. */
5755 vinfo->remove_stmt (store_info);
5756 }
5757 }
5758 }