/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48
49
50 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
51 slp_tree, stmt_vector_for_cost *);
52
/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  this->refcnt = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  delete node;
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();

  unsigned i;
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement: the type of its operand,
     its vector def-type, and an indication whether any def-stmt is a
     pattern stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */
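/* For example, if the chain accesses a[0], a[1] and a[3], the elements
   are at places 0, 1 and 3: DR_GROUP_GAP of each non-first element is
   its distance from the previous element (here 1, 1 and 2), which is
   what the loop below accumulates.  */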

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
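  /* Strategy: first try to fuse all COUNT elements into a single integer
     element; each time that fails, the loop tail halves the fused element
     size and doubles the number of intermediate vectors.  */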
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
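	      /* The two encoded series expand to the low and high
		 interleaves of the two permute inputs; e.g. for
		 NELTS == 8, SEL1 is { 0, 8, 1, 9, 2, 10, 3, 11 } and
		 SEL2 is { 4, 12, 5, 13, 6, 14, 7, 15 }.  */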
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates a swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is a cond_expr and the operands of its comparison need to
   be swapped; *SWAP is 2 if STMT is a cond_expr and the code of its
   comparison needs to be inverted.  If there is any operand swap in this
   function, *SWAP is set to a non-zero value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could end up with a different comparison code from the first
	 stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
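	  /* maps[0] is the identity; maps[1] (*swap == 1) exchanges the
	     two comparison operands; maps[2] (*swap == 2) exchanges the
	     then and else arms.  */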
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in an SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest are members of
	     the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For an SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    new vector type: %T\n", vectype);
    }
  return false;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify that the scalar stmts STMTS are isomorphic, do not require
   data permutation, and use supported operations.  Return true if so;
   otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this
   indicates that the comparison could not be carried out or that the
   stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node, flag it; the blend it
     requires is represented as a VEC_PERM_EXPR node and verified when
     that node is analyzed.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    *two_operators = true;

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

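/* Build an SLP tree for the stmt group STMTS, re-using (and ref-counting)
   subtrees already built for the same stmt set via BST_MAP.  A NULL entry
   in BST_MAP records a previously failed build, which also fails the
   re-build.  */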
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree for the stmt group STMTS.  Fail and
   return NULL if the def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  MATCHES
   records which stmts of the group matched the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, 0);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPRs have one operand too many if the condition
	     is an SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->ops);
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If all children of a non-unary operation are built up from uniform
     scalars then throw the SLP build away, causing this node to be
     built up from scalars instead.  */
  if (nops > 1
      && is_a <bb_vec_info> (vinfo)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      FOR_EACH_VEC_ELT (children, j, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	    || !vect_slp_tree_uniform_p (child))
	  break;
      if (!child)
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	child->refcnt++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
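      /* Build the lane permutation: lane I selects lane I of child ONE
	 when stmt I uses CODE0 and lane I of child TWO otherwise.  E.g.
	 for { a+b, a-b, a+b, a-b } this yields the (operand, lane)
	 pairs { (0,0), (1,1), (0,2), (1,3) }.  */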
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

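/* Debug helper: dump the SLP node NODE, meant to be callable from a
   debugger.  */
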
DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}

/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}

/* Copy the SLP subtree rooted at NODE.  */

static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  copy_ref = new _slp_tree;
  slp_tree copy = copy_ref;
  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
  SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
  copy->max_nunits = node->max_nunits;
  copy->refcnt = 0;
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}

/* Rearrange the statements of NODE according to PERMUTATION.  */
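/* Lane I of the node moves to lane PERMUTATION[I]; e.g. with
   PERMUTATION == { 2, 0, 1 } the scalar stmt in lane 0 ends up
   in lane 2.  */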

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
      for (i = 0; i < group_size; ++i)
	SLP_TREE_LANE_PERMUTATION (node)[i].second
	  = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
    }
}
1799
1800
1801 /* Attempt to reorder stmts in a reduction chain so that we don't
1802 require any load permutation. Return true if that was possible,
1803 otherwise return false. */
1804
1805 static bool
1806 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1807 {
1808 unsigned int i, j;
1809 unsigned int lidx;
1810 slp_tree node, load;
1811
1812 if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
1813 return false;
1814
1815 /* Compare all the permutation sequences to the first one. We know
1816 that at least one load is permuted. */
1817 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1818 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1819 return false;
1820 unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
1821 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1822 {
1823 if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
1824 || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
1825 return false;
1826 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
1827 if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
1828 return false;
1829 }
1830
1831 /* Check that the loads in the first sequence are different and there
1832 are no gaps between them. */
1833 auto_sbitmap load_index (group_size);
1834 bitmap_clear (load_index);
1835 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1836 {
1837 if (lidx >= group_size)
1838 return false;
1839 if (bitmap_bit_p (load_index, lidx))
1840 return false;
1841
1842 bitmap_set_bit (load_index, lidx);
1843 }
1844 for (i = 0; i < group_size; i++)
1845 if (!bitmap_bit_p (load_index, i))
1846 return false;
1847
1848 /* This permutation is valid for reduction. Since the order of the
1849 statements in the nodes is not important unless they are memory
1850 accesses, we can rearrange the statements in all the nodes
1851 according to the order of the loads. */
1852
1853 /* We have to unshare the SLP tree we modify. */
1854 hash_map<slp_tree, slp_tree> map;
1855 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1856 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1857 unshared->refcnt++;
1858 SLP_INSTANCE_TREE (slp_instn) = unshared;
1859 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1860 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1861 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1862
1863 /* Do the actual re-arrangement. */
1864 hash_set<slp_tree> visited;
1865 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1866 node->load_permutation, visited);
1867
1868 /* We are done, no actual permutations need to be generated. */
1869 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1870 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1871 {
1872 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1873 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1874 /* But we have to keep those permutations that are required because
1875 of handling of gaps. */
1876 if (known_eq (unrolling_factor, 1U)
1877 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1878 && DR_GROUP_GAP (first_stmt_info) == 0))
1879 SLP_TREE_LOAD_PERMUTATION (node).release ();
1880 else
1881 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1882 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1883 }
1884
1885 return true;
1886 }
1887
1888 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
1889
1890 static void
1891 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1892 hash_set<slp_tree> &visited)
1893 {
1894 if (visited.add (node))
1895 return;
1896
1897 if (SLP_TREE_CHILDREN (node).length () == 0)
1898 {
1899 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1900 return;
1901 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1902 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1903 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1904 loads.safe_push (node);
1905 }
1906 else
1907 {
1908 unsigned i;
1909 slp_tree child;
1910 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1911 vect_gather_slp_loads (loads, child, visited);
1912 }
1913 }
1914
1915 static void
1916 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1917 {
1918 hash_set<slp_tree> visited;
1919 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1920 }
1921
1922
1923 /* Find the last scalar stmt in NODE. */
1924
1925 stmt_vec_info
1926 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1927 {
1928 stmt_vec_info last = NULL;
1929 stmt_vec_info stmt_vinfo;
1930
1931 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1932 {
1933 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1934 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1935 }
1936
1937 return last;
1938 }
1939
1940 /* Find the first scalar stmt in NODE. */
1941
1942 stmt_vec_info
1943 vect_find_first_scalar_stmt_in_slp (slp_tree node)
1944 {
1945 stmt_vec_info first = NULL;
1946 stmt_vec_info stmt_vinfo;
1947
1948 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1949 {
1950 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1951 if (!first
1952 || get_later_stmt (stmt_vinfo, first) == first)
1953 first = stmt_vinfo;
1954 }
1955
1956 return first;
1957 }
1958
1959 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1960 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1961 (also containing the first GROUP1_SIZE stmts, since stores are
1962 consecutive), the second containing the remainder.
1963 Return the first stmt in the second group. */
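/* As an illustration, splitting a group of eight consecutive stores
   S0...S7 with GROUP1_SIZE = 4 yields the groups {S0..S3} and {S4..S7};
   each group's DR_GROUP_GAP then grows by the other group's size, so
   that advancing by a full original group skips the other half.  */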
1964
1965 static stmt_vec_info
1966 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1967 {
1968 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1969 gcc_assert (group1_size > 0);
1970 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1971 gcc_assert (group2_size > 0);
1972 DR_GROUP_SIZE (first_vinfo) = group1_size;
1973
1974 stmt_vec_info stmt_info = first_vinfo;
1975 for (unsigned i = group1_size; i > 1; i--)
1976 {
1977 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1978 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1979 }
1980 /* STMT_INFO is now the last element of the first group. */
1981 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1982 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1983
1984 DR_GROUP_SIZE (group2) = group2_size;
1985 for (stmt_info = group2; stmt_info;
1986 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1987 {
1988 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1989 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1990 }
1991
1992 /* For the second group, the DR_GROUP_GAP is that before the original group,
1993 plus skipping over the first group. */
1994 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1995
1996 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1997 DR_GROUP_GAP (first_vinfo) += group2_size;
1998
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2001 group1_size, group2_size);
2002
2003 return group2;
2004 }
2005
2006 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2007 statements and a vector of NUNITS elements. */
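/* For example, with NUNITS = 4 (say V4SI) and GROUP_SIZE = 6 the common
   multiple is 12, so the unrolling factor is exact_div (12, 6) = 2:
   two copies of the group fill three whole vectors.  */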
2008
2009 static poly_uint64
2010 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2011 {
2012 return exact_div (common_multiple (nunits, group_size), group_size);
2013 }
2014
2015 /* Analyze an SLP instance starting from a group of grouped stores. Call
2016 vect_build_slp_tree to build a tree of packed stmts if possible.
2017 Return FALSE if it's impossible to SLP any stmt in the loop. */
2018
2019 static bool
2020 vect_analyze_slp_instance (vec_info *vinfo,
2021 scalar_stmts_to_slp_tree_map_t *bst_map,
2022 stmt_vec_info stmt_info, unsigned max_tree_size)
2023 {
2024 slp_instance new_instance;
2025 slp_tree node;
2026 unsigned int group_size;
2027 tree vectype, scalar_type = NULL_TREE;
2028 unsigned int i;
2029 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2030 vec<stmt_vec_info> scalar_stmts;
2031 bool constructor = false;
2032
2033 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2034 {
2035 scalar_type = TREE_TYPE (DR_REF (dr));
2036 group_size = DR_GROUP_SIZE (stmt_info);
2037 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2038 }
2039 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2040 {
2041 gcc_assert (is_a <loop_vec_info> (vinfo));
2042 vectype = STMT_VINFO_VECTYPE (stmt_info);
2043 group_size = REDUC_GROUP_SIZE (stmt_info);
2044 }
2045 else if (is_gimple_assign (stmt_info->stmt)
2046 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2047 {
2048 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2049 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2050 constructor = true;
2051 }
2052 else
2053 {
2054 gcc_assert (is_a <loop_vec_info> (vinfo));
2055 vectype = STMT_VINFO_VECTYPE (stmt_info);
2056 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2057 }
2058
2059 if (!vectype)
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "Build SLP failed: unsupported data-type %T\n",
2064 scalar_type);
2065
2066 return false;
2067 }
2068 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2069
2070 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2071 scalar_stmts.create (group_size);
2072 stmt_vec_info next_info = stmt_info;
2073 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2074 {
2075 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2076 while (next_info)
2077 {
2078 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2079 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2080 }
2081 }
2082 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2083 {
2084 /* Collect the reduction stmts and store them in
2085 SLP_TREE_SCALAR_STMTS. */
2086 while (next_info)
2087 {
2088 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2089 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2090 }
2091 /* Mark the first element of the reduction chain as reduction to properly
2092 transform the node. In the reduction analysis phase only the last
2093 element of the chain is marked as reduction. */
2094 STMT_VINFO_DEF_TYPE (stmt_info)
2095 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2096 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2097 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2098 }
2099 else if (constructor)
2100 {
2101 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2102 tree val;
2103 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2104 {
2105 if (TREE_CODE (val) == SSA_NAME)
2106 {
2107 gimple* def = SSA_NAME_DEF_STMT (val);
2108 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2109 /* Value is defined in another basic block. */
2110 if (!def_info)
2111 return false;
2112 def_info = vect_stmt_to_vectorize (def_info);
2113 scalar_stmts.safe_push (def_info);
2114 }
2115 else
2116 return false;
2117 }
2118 if (dump_enabled_p ())
2119 dump_printf_loc (MSG_NOTE, vect_location,
2120 "Analyzing vectorizable constructor: %G\n",
2121 stmt_info->stmt);
2122 }
2123 else
2124 {
2125 /* Collect reduction statements. */
2126 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2127 for (i = 0; reductions.iterate (i, &next_info); i++)
2128 scalar_stmts.safe_push (next_info);
2129 }
2130
2131 /* Build the tree for the SLP instance. */
2132 bool *matches = XALLOCAVEC (bool, group_size);
2133 unsigned npermutes = 0;
2134 poly_uint64 max_nunits = nunits;
2135 unsigned tree_size = 0;
2136 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2137 &max_nunits, matches, &npermutes,
2138 &tree_size, bst_map);
2139 if (node != NULL)
2140 {
2141 /* Calculate the unrolling factor based on the smallest type. */
2142 poly_uint64 unrolling_factor
2143 = calculate_unrolling_factor (max_nunits, group_size);
2144
2145 if (maybe_ne (unrolling_factor, 1U)
2146 && is_a <bb_vec_info> (vinfo))
2147 {
2148 unsigned HOST_WIDE_INT const_max_nunits;
2149 if (!max_nunits.is_constant (&const_max_nunits)
2150 || const_max_nunits > group_size)
2151 {
2152 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2154 "Build SLP failed: store group "
2155 "size not a multiple of the vector size "
2156 "in basic block SLP\n");
2157 vect_free_slp_tree (node, false);
2158 return false;
2159 }
2160 /* Fatal mismatch. */
2161 matches[group_size / const_max_nunits * const_max_nunits] = false;
2162 vect_free_slp_tree (node, false);
2163 }
2164 else
2165 {
2166 /* Create a new SLP instance. */
2167 new_instance = XNEW (class _slp_instance);
2168 SLP_INSTANCE_TREE (new_instance) = node;
2169 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2170 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2171 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2172
2173 vect_gather_slp_loads (new_instance, node);
2174 if (dump_enabled_p ())
2175 dump_printf_loc (MSG_NOTE, vect_location,
2176 "SLP size %u vs. limit %u.\n",
2177 tree_size, max_tree_size);
2178
2179 /* Check whether any load is possibly permuted. */
2180 slp_tree load_node;
2181 bool loads_permuted = false;
2182 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2183 {
2184 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2185 continue;
2186 unsigned j;
2187 stmt_vec_info load_info;
2188 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2189 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2190 {
2191 loads_permuted = true;
2192 break;
2193 }
2194 }
2195
2196 /* If the loads and stores can be handled with load/store-lane
2197 instructions, do not generate this SLP instance. */
2198 if (is_a <loop_vec_info> (vinfo)
2199 && loads_permuted
2200 && dr && vect_store_lanes_supported (vectype, group_size, false))
2201 {
2202 slp_tree load_node;
2203 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2204 {
2205 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2206 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2207 /* Use SLP for strided accesses (or if we can't load-lanes). */
2208 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2209 || ! vect_load_lanes_supported
2210 (STMT_VINFO_VECTYPE (stmt_vinfo),
2211 DR_GROUP_SIZE (stmt_vinfo), false))
2212 break;
2213 }
2214 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2215 {
2216 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2218 "Built SLP cancelled: can use "
2219 "load/store-lanes\n");
2220 vect_free_slp_instance (new_instance, false);
2221 return false;
2222 }
2223 }
2224
2225 /* If this is a reduction chain with a conversion in front,
2226 amend the SLP tree with a node for that. */
2227 if (!dr
2228 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2229 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2230 {
2231 /* Get at the conversion stmt - we know it's the single use
2232 of the last stmt of the reduction chain. */
2233 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2234 use_operand_p use_p;
2235 gimple *use_stmt;
2236 bool r = single_imm_use (gimple_assign_lhs (tem),
2237 &use_p, &use_stmt);
2238 gcc_assert (r);
2239 next_info = vinfo->lookup_stmt (use_stmt);
2240 next_info = vect_stmt_to_vectorize (next_info);
2241 scalar_stmts = vNULL;
2242 scalar_stmts.create (group_size);
2243 for (unsigned i = 0; i < group_size; ++i)
2244 scalar_stmts.quick_push (next_info);
2245 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2246 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2247 SLP_TREE_CHILDREN (conv).quick_push (node);
2248 SLP_INSTANCE_TREE (new_instance) = conv;
2249 /* We also have to fake this conversion stmt as an SLP reduction
2250 group so we don't have to mess with too much code
2251 elsewhere. */
2252 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2253 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2254 }
2255
2256 vinfo->slp_instances.safe_push (new_instance);
2257
2258 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2259 the number of scalar stmts in the root in a few places.
2260 Verify that assumption holds. */
2261 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2262 .length () == group_size);
2263
2264 if (dump_enabled_p ())
2265 {
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Final SLP tree for instance:\n");
2268 vect_print_slp_graph (MSG_NOTE, vect_location,
2269 SLP_INSTANCE_TREE (new_instance));
2270 }
2271
2272 return true;
2273 }
2274 }
2275 else
2276 {
2277 /* Failed to SLP. */
2278 /* Free the allocated memory. */
2279 scalar_stmts.release ();
2280 }
2281
2282 /* For basic block SLP, try to break the group up into multiples of the
2283 vector size. */
2284 unsigned HOST_WIDE_INT const_nunits;
2285 if (is_a <bb_vec_info> (vinfo)
2286 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2287 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2288 && nunits.is_constant (&const_nunits))
2289 {
2290 /* We consider breaking the group only on VF boundaries from the existing
2291 start. */
2292 for (i = 0; i < group_size; i++)
2293 if (!matches[i]) break;
2294
2295 if (i >= const_nunits && i < group_size)
2296 {
2297 /* Split into two groups at the first vector boundary before i. */
2298 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2299 unsigned group1_size = i & ~(const_nunits - 1);
2300
2301 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2302 group1_size);
2303 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2304 max_tree_size);
2305 /* If the first non-match was in the middle of a vector,
2306 skip the rest of that vector. */
2307 if (group1_size < i)
2308 {
2309 i = group1_size + const_nunits;
2310 if (i < group_size)
2311 rest = vect_split_slp_store_group (rest, const_nunits);
2312 }
2313 if (i < group_size)
2314 res |= vect_analyze_slp_instance (vinfo, bst_map,
2315 rest, max_tree_size);
2316 return res;
2317 }
2318 /* Even though the first vector did not all match, we might be able to SLP
2319 (some) of the remainder. FORNOW ignore this possibility. */
2320 }
2321
2322 return false;
2323 }
2324
2325
2326 /* Check if there are stmts in the loop that can be vectorized using SLP. Build
2327 SLP trees of packed scalar stmts if SLP is possible. */
2328
2329 opt_result
2330 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2331 {
2332 unsigned int i;
2333 stmt_vec_info first_element;
2334
2335 DUMP_VECT_SCOPE ("vect_analyze_slp");
2336
2337 scalar_stmts_to_slp_tree_map_t *bst_map
2338 = new scalar_stmts_to_slp_tree_map_t ();
2339
2340 /* Find SLP sequences starting from groups of grouped stores. */
2341 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2342 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2343
2344 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2345 {
2346 if (loop_vinfo->reduction_chains.length () > 0)
2347 {
2348 /* Find SLP sequences starting from reduction chains. */
2349 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2350 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2351 max_tree_size))
2352 {
2353 /* Dissolve reduction chain group. */
2354 stmt_vec_info vinfo = first_element;
2355 stmt_vec_info last = NULL;
2356 while (vinfo)
2357 {
2358 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2359 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2360 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2361 last = vinfo;
2362 vinfo = next;
2363 }
2364 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2365 /* It can still be vectorized as part of an SLP reduction. */
2366 loop_vinfo->reductions.safe_push (last);
2367 }
2368 }
2369
2370 /* Find SLP sequences starting from groups of reductions. */
2371 if (loop_vinfo->reductions.length () > 1)
2372 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2373 max_tree_size);
2374 }
2375
2376 /* The map keeps a reference on SLP nodes built, release that. */
2377 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2378 it != bst_map->end (); ++it)
2379 if ((*it).second)
2380 vect_free_slp_tree ((*it).second, false);
2381 delete bst_map;
2382
2383 /* Optimize permutations in SLP reductions. */
2384 slp_instance instance;
2385 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2386 {
2387 slp_tree node = SLP_INSTANCE_TREE (instance);
2388 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2389 /* Reduction (there are no data-refs in the root).
2390 In a reduction chain the order of the loads is not important. */
2391 if (!STMT_VINFO_DATA_REF (stmt_info)
2392 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2393 vect_attempt_slp_rearrange_stmts (instance);
2394 }
2395
2396 /* Gather all loads in the SLP graph. */
2397 hash_set<slp_tree> visited;
2398 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2399 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2400 visited);
2401
2402 return opt_result::success ();
2403 }
2404
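/* Optimize the SLP graph of VINFO by releasing load permutations that
   turn out to be unnecessary: in basic block vectorization any
   contiguous subchain of an interleaving chain, and in loop
   vectorization any identity permutation that does not expose a gap
   under unrolling.  */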
2405 void
2406 vect_optimize_slp (vec_info *vinfo)
2407 {
2408 slp_tree node;
2409 unsigned i;
2410 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2411 {
2412 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2413 continue;
2414
2415 /* In basic block vectorization we allow any subchain of an interleaving
2416 chain.
2417 FORNOW: not in loop SLP because of realignment complications. */
2418 if (is_a <bb_vec_info> (vinfo))
2419 {
2420 bool subchain_p = true;
2421 stmt_vec_info next_load_info = NULL;
2422 stmt_vec_info load_info;
2423 unsigned j;
2424 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2425 {
2426 if (j != 0
2427 && (next_load_info != load_info
2428 || DR_GROUP_GAP (load_info) != 1))
2429 {
2430 subchain_p = false;
2431 break;
2432 }
2433 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2434 }
2435 if (subchain_p)
2436 {
2437 SLP_TREE_LOAD_PERMUTATION (node).release ();
2438 continue;
2439 }
2440 }
2441 else
2442 {
2443 stmt_vec_info load_info;
2444 bool this_load_permuted = false;
2445 unsigned j;
2446 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2447 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2448 {
2449 this_load_permuted = true;
2450 break;
2451 }
2452 stmt_vec_info first_stmt_info
2453 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2454 if (!this_load_permuted
2455 /* The load requires permutation when unrolling exposes
2456 a gap either because the group is larger than the SLP
2457 group-size or because there is a gap between the groups. */
2458 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2459 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2460 && DR_GROUP_GAP (first_stmt_info) == 0)))
2461 {
2462 SLP_TREE_LOAD_PERMUTATION (node).release ();
2463 continue;
2464 }
2465 }
2466 }
2467 }
2468
2469
2470 /* For each possible SLP instance decide whether to SLP it and calculate the
2471 overall unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2472 least one instance. */
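/* For instance, combining instances with unrolling factors 2 and 3
   forces an overall unrolling factor of 6, their least common
   multiple.  */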
2473
2474 bool
2475 vect_make_slp_decision (loop_vec_info loop_vinfo)
2476 {
2477 unsigned int i;
2478 poly_uint64 unrolling_factor = 1;
2479 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2480 slp_instance instance;
2481 int decided_to_slp = 0;
2482
2483 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2484
2485 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2486 {
2487 /* FORNOW: SLP if you can. */
2488 /* All unroll factors have the form:
2489
2490 GET_MODE_SIZE (vinfo->vector_mode) * X
2491
2492 for some rational X, so they must have a common multiple. */
2493 unrolling_factor
2494 = force_common_multiple (unrolling_factor,
2495 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2496
2497 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2498 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2499 loop-based vectorization. Such stmts will be marked as HYBRID. */
2500 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2501 decided_to_slp++;
2502 }
2503
2504 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2505
2506 if (decided_to_slp && dump_enabled_p ())
2507 {
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "Decided to SLP %d instances. Unrolling factor ",
2510 decided_to_slp);
2511 dump_dec (MSG_NOTE, unrolling_factor);
2512 dump_printf (MSG_NOTE, "\n");
2513 }
2514
2515 return (decided_to_slp > 0);
2516 }
2517
2518 /* Private data for vect_detect_hybrid_slp. */
2519 struct vdhs_data
2520 {
2521 loop_vec_info loop_vinfo;
2522 vec<stmt_vec_info> *worklist;
2523 };
2524
2525 /* Walker for walk_gimple_op. */
2526
2527 static tree
2528 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2529 {
2530 walk_stmt_info *wi = (walk_stmt_info *)data;
2531 vdhs_data *dat = (vdhs_data *)wi->info;
2532
2533 if (wi->is_lhs)
2534 return NULL_TREE;
2535
2536 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2537 if (!def_stmt_info)
2538 return NULL_TREE;
2539 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2540 if (PURE_SLP_STMT (def_stmt_info))
2541 {
2542 if (dump_enabled_p ())
2543 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2544 def_stmt_info->stmt);
2545 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2546 dat->worklist->safe_push (def_stmt_info);
2547 }
2548
2549 return NULL_TREE;
2550 }
2551
2552 /* Find stmts that must be both vectorized and SLPed. */
2553
2554 void
2555 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2556 {
2557 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2558
2559 /* All stmts participating in SLP are marked pure_slp, all other
2560 stmts are loop_vect.
2561 First collect all loop_vect stmts into a worklist. */
2562 auto_vec<stmt_vec_info> worklist;
2563 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2564 {
2565 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2566 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2567 gsi_next (&gsi))
2568 {
2569 gphi *phi = gsi.phi ();
2570 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2571 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2572 worklist.safe_push (stmt_info);
2573 }
2574 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2575 gsi_next (&gsi))
2576 {
2577 gimple *stmt = gsi_stmt (gsi);
2578 if (is_gimple_debug (stmt))
2579 continue;
2580 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2581 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2582 {
2583 for (gimple_stmt_iterator gsi2
2584 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2585 !gsi_end_p (gsi2); gsi_next (&gsi2))
2586 {
2587 stmt_vec_info patt_info
2588 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2589 if (!STMT_SLP_TYPE (patt_info)
2590 && STMT_VINFO_RELEVANT (patt_info))
2591 worklist.safe_push (patt_info);
2592 }
2593 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2594 }
2595 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2596 worklist.safe_push (stmt_info);
2597 }
2598 }
2599
2600 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2601 mark any SLP vectorized stmt as hybrid.
2602 ??? We're visiting def stmts N times (once for each non-SLP and
2603 once for each hybrid-SLP use). */
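/* E.g. when a loop_vect stmt uses the result of a pure_slp stmt, that
   def must also be available outside the SLP graph; it is re-marked
   hybrid and pushed so that the defs it uses in turn are re-examined
   as well.  */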
2604 walk_stmt_info wi;
2605 vdhs_data dat;
2606 dat.worklist = &worklist;
2607 dat.loop_vinfo = loop_vinfo;
2608 memset (&wi, 0, sizeof (wi));
2609 wi.info = (void *)&dat;
2610 while (!worklist.is_empty ())
2611 {
2612 stmt_vec_info stmt_info = worklist.pop ();
2613 /* Since SSA operands are not set up for pattern stmts we need
2614 to use walk_gimple_op. */
2615 wi.is_lhs = 0;
2616 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2617 }
2618 }
2619
2620
2621 /* Initialize a bb_vec_info struct for the statements between
2622 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2623
2624 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2625 gimple_stmt_iterator region_end_in,
2626 vec_info_shared *shared)
2627 : vec_info (vec_info::bb, init_cost (NULL), shared),
2628 bb (gsi_bb (region_begin_in)),
2629 region_begin (region_begin_in),
2630 region_end (region_end_in)
2631 {
2632 for (gimple *stmt : this->region_stmts ())
2633 {
2634 gimple_set_uid (stmt, 0);
2635 if (is_gimple_debug (stmt))
2636 continue;
2637 add_stmt (stmt);
2638 }
2639
2640 bb->aux = this;
2641 }
2642
2643
2644 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2645 stmts in the basic block. */
2646
2647 _bb_vec_info::~_bb_vec_info ()
2648 {
2649 for (gimple *stmt : this->region_stmts ())
2650 /* Reset region marker. */
2651 gimple_set_uid (stmt, -1);
2652
2653 bb->aux = NULL;
2654 }
2655
2656 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2657 given that child nodes have already been processed, and that
2658 their def types currently match their SLP node's def type. */
2659
2660 static bool
2661 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2662 slp_instance node_instance,
2663 stmt_vector_for_cost *cost_vec)
2664 {
2665 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2666 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2667
2668 /* Calculate the number of vector statements to be created for the
2669 scalar stmts in this node. For SLP reductions it is equal to the
2670 number of vector statements in the children (which has already been
2671 calculated by the recursive call). Otherwise it is the number of
2672 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2673 VF divided by the number of elements in a vector. */
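/* For instance, with VF = 4, GROUP_SIZE = 8 and a V8HI vector type
   this yields (4 * 8) / 8 = 4 vector statements.  */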
2674 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2675 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2676 {
2677 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2678 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2679 {
2680 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2681 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2682 break;
2683 }
2684 }
2685 else
2686 {
2687 poly_uint64 vf;
2688 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2689 vf = loop_vinfo->vectorization_factor;
2690 else
2691 vf = 1;
2692 unsigned int group_size = SLP_TREE_LANES (node);
2693 tree vectype = SLP_TREE_VECTYPE (node);
2694 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2695 = vect_get_num_vectors (vf * group_size, vectype);
2696 }
2697
2698 /* Handle purely internal nodes. */
2699 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2700 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
2701
2702 bool dummy;
2703 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2704 node, node_instance, cost_vec);
2705 }
2706
2707 /* Try to build NODE from scalars, returning true on success.
2708 NODE_INSTANCE is the SLP instance that contains NODE. */
2709
2710 static bool
2711 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2712 slp_instance node_instance)
2713 {
2714 stmt_vec_info stmt_info;
2715 unsigned int i;
2716
2717 if (!is_a <bb_vec_info> (vinfo)
2718 || node == SLP_INSTANCE_TREE (node_instance)
2719 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2720 return false;
2721
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE, vect_location,
2724 "Building vector operands from scalars instead\n");
2725
2726 /* Don't remove and free the child nodes here, since they could be
2727 referenced by other structures. The analysis and scheduling phases
2728 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2729 unsigned int group_size = SLP_TREE_LANES (node);
2730 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2731 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2732 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2733 {
2734 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2735 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2736 }
2737 return true;
2738 }
2739
2740 /* Compute the prologue cost for invariant or constant operands represented
2741 by NODE. */
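/* For example, the external operand vector {x, x, x, x} can be costed
   as a single scalar_to_vec splat, {a, b, c, d} needs a vec_construct,
   and a vector of constants is costed as a vector_load from the
   constant pool.  */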
2742
2743 static void
2744 vect_prologue_cost_for_slp (slp_tree node,
2745 stmt_vector_for_cost *cost_vec)
2746 {
2747 /* Without looking at the actual initializer a vector of
2748 constants can be implemented as a load from the constant pool.
2749 When all elements are the same we can use a splat. */
2750 tree vectype = SLP_TREE_VECTYPE (node);
2751 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2752 unsigned num_vects_to_check;
2753 unsigned HOST_WIDE_INT const_nunits;
2754 unsigned nelt_limit;
2755 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2756 && ! multiple_p (const_nunits, group_size))
2757 {
2758 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2759 nelt_limit = const_nunits;
2760 }
2761 else
2762 {
2763 /* If either the vector has variable length or the vectors
2764 are composed of repeated whole groups we only need to
2765 cost construction once. All vectors will be the same. */
2766 num_vects_to_check = 1;
2767 nelt_limit = group_size;
2768 }
2769 tree elt = NULL_TREE;
2770 unsigned nelt = 0;
2771 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2772 {
2773 unsigned si = j % group_size;
2774 if (nelt == 0)
2775 elt = SLP_TREE_SCALAR_OPS (node)[si];
2776 /* ??? We're just tracking whether all operands of a single
2777 vector initializer are the same, ideally we'd check if
2778 we emitted the same one already. */
2779 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2780 elt = NULL_TREE;
2781 nelt++;
2782 if (nelt == nelt_limit)
2783 {
2784 record_stmt_cost (cost_vec, 1,
2785 SLP_TREE_DEF_TYPE (node) == vect_external_def
2786 ? (elt ? scalar_to_vec : vec_construct)
2787 : vector_load,
2788 NULL, vectype, 0, vect_prologue);
2789 nelt = 0;
2790 }
2791 }
2792 }
2793
2794 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2795 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2796
2797 Return true if the operations are supported. */
2798
2799 static bool
2800 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2801 slp_instance node_instance,
2802 hash_set<slp_tree> &visited,
2803 hash_set<slp_tree> &lvisited,
2804 stmt_vector_for_cost *cost_vec)
2805 {
2806 int i, j;
2807 slp_tree child;
2808
2809 /* Assume we can code-generate all invariants. */
2810 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2811 return true;
2812
2813 /* If we already analyzed the exact same set of scalar stmts we're done.
2814 We share the generated vector stmts for those.
2815 The SLP graph is acyclic so not caching whether we failed or succeeded
2816 doesn't result in any issue since we throw away the lvisited set
2817 when we fail. */
2818 if (visited.contains (node)
2819 || lvisited.add (node))
2820 return true;
2821
2822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2823 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2824 visited, lvisited, cost_vec))
2825 return false;
2826
2827 bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2828 cost_vec);
2829
2830 /* When the node can be vectorized, cost the invariant nodes it references.
2831 This is not done in DFS order to allow the referring node's
2832 vectorizable_* calls to nail down the invariant nodes' vector type
2833 and possibly unshare it if it needs a different vector type than
2834 other referrers. */
2835 if (res)
2836 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2837 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2838 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2839 /* Perform usual caching; note that code generation still
2840 code-gens these nodes multiple times, but we expect
2841 to CSE them later. */
2842 && !visited.contains (child)
2843 && !lvisited.add (child))
2844 {
2845 /* ??? After auditing more code paths make a "default"
2846 and push the vector type from NODE to all children
2847 if it is not already set. */
2848 /* Compute the number of vectors to be generated. */
2849 tree vector_type = SLP_TREE_VECTYPE (child);
2850 if (!vector_type)
2851 {
2852 /* For shifts with a scalar argument we don't need
2853 to cost or code-generate anything.
2854 ??? Represent this more explicitly. */
2855 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
2856 == shift_vec_info_type)
2857 && j == 1);
2858 continue;
2859 }
2860 unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
2861 poly_uint64 vf = 1;
2862 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2863 vf = loop_vinfo->vectorization_factor;
2864 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2865 = vect_get_num_vectors (vf * group_size, vector_type);
2866 /* And cost them. */
2867 vect_prologue_cost_for_slp (child, cost_vec);
2868 }
2869
2870 /* If this node can't be vectorized, try pruning the tree here rather
2871 than felling the whole thing. */
2872 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2873 {
2874 /* We'll need to revisit this for invariant costing and for
2875 setting the number of vectorized stmts. */
2876 lvisited.remove (node);
2877 res = true;
2878 }
2879
2880 return res;
2881 }
2882
2883
2884 /* Analyze statements in SLP instances of VINFO. Return true if the
2885 operations are supported. */
2886
2887 bool
2888 vect_slp_analyze_operations (vec_info *vinfo)
2889 {
2890 slp_instance instance;
2891 int i;
2892
2893 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2894
2895 hash_set<slp_tree> visited;
2896 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2897 {
2898 hash_set<slp_tree> lvisited;
2899 stmt_vector_for_cost cost_vec;
2900 cost_vec.create (2);
2901 if (!vect_slp_analyze_node_operations (vinfo,
2902 SLP_INSTANCE_TREE (instance),
2903 instance, visited, lvisited,
2904 &cost_vec)
2905 /* Instances with a root stmt require vectorized defs for the
2906 SLP tree root. */
2907 || (SLP_INSTANCE_ROOT_STMT (instance)
2908 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2909 != vect_internal_def)))
2910 {
2911 slp_tree node = SLP_INSTANCE_TREE (instance);
2912 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2913 if (dump_enabled_p ())
2914 dump_printf_loc (MSG_NOTE, vect_location,
2915 "removing SLP instance operations starting from: %G",
2916 stmt_info->stmt);
2917 vect_free_slp_instance (instance, false);
2918 vinfo->slp_instances.ordered_remove (i);
2919 cost_vec.release ();
2920 }
2921 else
2922 {
2923 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2924 x != lvisited.end(); ++x)
2925 visited.add (*x);
2926 i++;
2927
2928 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
2929 cost_vec.release ();
2930 }
2931 }
2932
2933 return !vinfo->slp_instances.is_empty ();
2934 }
2935
2936
2937 /* Compute the scalar cost of the SLP node NODE and its children
2938 and record it in COST_VEC. Do not account defs that are marked
2939 in LIFE; update LIFE according to uses of NODE. */
2940
2941 static void
2942 vect_bb_slp_scalar_cost (vec_info *vinfo,
2943 slp_tree node, vec<bool, va_heap> *life,
2944 stmt_vector_for_cost *cost_vec,
2945 hash_set<slp_tree> &visited)
2946 {
2947 unsigned i;
2948 stmt_vec_info stmt_info;
2949 slp_tree child;
2950
2951 if (visited.add (node))
2952 return;
2953
2954 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2955 {
2956 gimple *stmt = stmt_info->stmt;
2957 ssa_op_iter op_iter;
2958 def_operand_p def_p;
2959
2960 if ((*life)[i])
2961 continue;
2962
2963 /* If there is a non-vectorized use of the defs then the scalar
2964 stmt is kept live, in which case we do not account it or any
2965 required defs in the SLP children in the scalar cost. This
2966 way we make the vectorization more costly when compared to
2967 the scalar cost. */
2968 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2969 {
2970 imm_use_iterator use_iter;
2971 gimple *use_stmt;
2972 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2973 if (!is_gimple_debug (use_stmt))
2974 {
2975 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2976 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2977 {
2978 (*life)[i] = true;
2979 BREAK_FROM_IMM_USE_STMT (use_iter);
2980 }
2981 }
2982 }
2983 if ((*life)[i])
2984 continue;
2985
2986 /* Count scalar stmts only once. */
2987 if (gimple_visited_p (stmt))
2988 continue;
2989 gimple_set_visited (stmt, true);
2990
2991 vect_cost_for_stmt kind;
2992 if (STMT_VINFO_DATA_REF (stmt_info))
2993 {
2994 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2995 kind = scalar_load;
2996 else
2997 kind = scalar_store;
2998 }
2999 else if (vect_nop_conversion_p (stmt_info))
3000 continue;
3001 else
3002 kind = scalar_stmt;
3003 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
3004 }
3005
3006 auto_vec<bool, 20> subtree_life;
3007 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3008 {
3009 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3010 {
3011 /* Do not directly pass LIFE to the recursive call, copy it to
3012 confine changes in the callee to the current child/subtree. */
3013 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3014 {
3015 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child));
3016 for (unsigned j = 0;
3017 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3018 {
3019 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3020 if (perm.first == i)
3021 subtree_life[perm.second] = (*life)[j];
3022 }
3023 }
3024 else
3025 {
3026 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3027 subtree_life.safe_splice (*life);
3028 }
3029 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3030 visited);
3031 subtree_life.truncate (0);
3032 }
3033 }
3034 }
3035
3036 /* Check if vectorization of the basic block is profitable. */
3037
3038 static bool
3039 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3040 {
3041 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3042 slp_instance instance;
3043 int i;
3044 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3045 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3046
3047 /* Calculate scalar cost. */
3048 stmt_vector_for_cost scalar_costs;
3049 scalar_costs.create (0);
3050 hash_set<slp_tree> visited;
3051 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3052 {
3053 auto_vec<bool, 20> life;
3054 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)));
3055 vect_bb_slp_scalar_cost (bb_vinfo,
3056 SLP_INSTANCE_TREE (instance),
3057 &life, &scalar_costs, visited);
3058 }
3059 /* Unset visited flag. */
3060 stmt_info_for_cost *si;
3061 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3062 gimple_set_visited (si->stmt_info->stmt, false);
3063
3064 void *target_cost_data = init_cost (NULL);
3065 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3066 scalar_costs.release ();
3067 unsigned dummy;
3068 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3069 destroy_cost_data (target_cost_data);
3070
3071 /* Complete the target-specific cost calculation. */
3072 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3073 &vec_inside_cost, &vec_epilogue_cost);
3074
3075 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3076
3077 if (dump_enabled_p ())
3078 {
3079 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3080 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3081 vec_inside_cost);
3082 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3083 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3084 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3085 }
3086
3087 /* Vectorization is profitable if its cost is not more than the cost of the
3088 scalar version. Note that we err on the vector side for equal cost because
3089 the cost estimate is otherwise quite pessimistic (constant uses are
3090 free on the scalar side but cost a load on the vector side, for
3091 example). */
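/* E.g. with VEC_INSIDE_COST = 4, VEC_OUTSIDE_COST = 2 and
   SCALAR_COST = 6 the costs tie and we still vectorize.  */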
3092 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3093 return false;
3094
3095 return true;
3096 }
3097
3098 /* Find any vectorizable constructors and add them to the grouped_stores
3099 array. */
3100
3101 static void
3102 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3103 {
3104 for (gimple *stmt : bb_vinfo->region_stmts ())
3105 {
3106 gassign *assign = dyn_cast<gassign *> (stmt);
3107 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3108 continue;
3109
3110 tree rhs = gimple_assign_rhs1 (assign);
3111 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3112 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3113 CONSTRUCTOR_NELTS (rhs))
3114 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3115 || uniform_vector_p (rhs))
3116 continue;
3117
3118 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3119 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3120 }
3121 }
3122
3123 /* Check if the region described by BB_VINFO can be vectorized, returning
3124 true if so. When returning false, set FATAL to true if the same failure
3125 would prevent vectorization at other vector sizes, false if it is still
3126 worth trying other sizes. N_STMTS is the number of statements in the
3127 region. */
3128
3129 static bool
3130 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3131 {
3132 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3133
3134 slp_instance instance;
3135 int i;
3136 poly_uint64 min_vf = 2;
3137
3138 /* The first group of checks is independent of the vector size. */
3139 fatal = true;
3140
3141 /* Analyze the data references. */
3142
3143 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3144 {
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3147 "not vectorized: unhandled data-ref in basic "
3148 "block.\n");
3149 return false;
3150 }
3151
3152 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3153 {
3154 if (dump_enabled_p ())
3155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156 "not vectorized: unhandled data access in "
3157 "basic block.\n");
3158 return false;
3159 }
3160
3161 vect_slp_check_for_constructors (bb_vinfo);
3162
3163 /* If there are no grouped stores and no constructors in the region
3164 there is no need to continue with pattern recog as vect_analyze_slp
3165 will fail anyway. */
3166 if (bb_vinfo->grouped_stores.is_empty ())
3167 {
3168 if (dump_enabled_p ())
3169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3170 "not vectorized: no grouped stores in "
3171 "basic block.\n");
3172 return false;
3173 }
3174
3175 /* The rest of the analysis below depends on the vector size in some way. */
3176 fatal = false;
3177
3178 vect_pattern_recog (bb_vinfo);
3179
3180 /* Check the SLP opportunities in the basic block, analyze and build SLP
3181 trees. */
3182 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3183 {
3184 if (dump_enabled_p ())
3185 {
3186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3187 "Failed to SLP the basic block.\n");
3188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189 "not vectorized: failed to find SLP opportunities "
3190 "in basic block.\n");
3191 }
3192 return false;
3193 }
3194
3195 /* Optimize permutations. */
3196 vect_optimize_slp (bb_vinfo);
3197
3198 vect_record_base_alignments (bb_vinfo);
3199
3200 /* Analyze and verify the alignment of data references and the
3201 dependence in the SLP instances. */
3202 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3203 {
3204 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
3205 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3206 {
3207 slp_tree node = SLP_INSTANCE_TREE (instance);
3208 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_NOTE, vect_location,
3211 "removing SLP instance operations starting from: %G",
3212 stmt_info->stmt);
3213 vect_free_slp_instance (instance, false);
3214 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3215 continue;
3216 }
3217
3218 /* Mark all the statements that we want to vectorize as pure SLP and
3219 relevant. */
3220 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3221 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3222 if (SLP_INSTANCE_ROOT_STMT (instance))
3223 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3224
3225 i++;
3226 }
3227 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3228 return false;
3229
3230 if (!vect_slp_analyze_operations (bb_vinfo))
3231 {
3232 if (dump_enabled_p ())
3233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3234 "not vectorized: bad operation in basic block.\n");
3235 return false;
3236 }
3237
3238 /* Cost model: check if the vectorization is worthwhile. */
3239 if (!unlimited_cost_model (NULL)
3240 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3241 {
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3244 "not vectorized: vectorization is not "
3245 "profitable.\n");
3246 return false;
3247 }
3248
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "Basic block will be vectorized using SLP\n");
3252 return true;
3253 }
3254
3255 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3256 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3257 on success. The region has N_STMTS statements and has the datarefs
3258 given by DATAREFS. */
3259
3260 static bool
3261 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3262 gimple_stmt_iterator region_end,
3263 vec<data_reference_p> datarefs,
3264 unsigned int n_stmts)
3265 {
3266 bb_vec_info bb_vinfo;
3267 auto_vector_modes vector_modes;
3268
3269 /* Autodetect the first vector mode we try. */
3270 machine_mode next_vector_mode = VOIDmode;
3271 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3272 unsigned int mode_i = 0;
3273
3274 vec_info_shared shared;
3275
3276 machine_mode autodetected_vector_mode = VOIDmode;
3277 while (1)
3278 {
3279 bool vectorized = false;
3280 bool fatal = false;
3281 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3282
3283 bool first_time_p = shared.datarefs.is_empty ();
3284 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3285 if (first_time_p)
3286 bb_vinfo->shared->save_datarefs ();
3287 else
3288 bb_vinfo->shared->check_datarefs ();
3289 bb_vinfo->vector_mode = next_vector_mode;
3290
3291 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3292 && dbg_cnt (vect_slp))
3293 {
3294 if (dump_enabled_p ())
3295 {
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "***** Analysis succeeded with vector mode"
3298 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3299 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3300 }
3301
3302 bb_vinfo->shared->check_datarefs ();
3303 vect_schedule_slp (bb_vinfo);
3304
3305 unsigned HOST_WIDE_INT bytes;
3306 if (dump_enabled_p ())
3307 {
3308 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3309 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3310 "basic block part vectorized using %wu byte "
3311 "vectors\n", bytes);
3312 else
3313 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3314 "basic block part vectorized using variable "
3315 "length vectors\n");
3316 }
3317
3318 vectorized = true;
3319 }
3320 else
3321 {
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE, vect_location,
3324 "***** Analysis failed with vector mode %s\n",
3325 GET_MODE_NAME (bb_vinfo->vector_mode));
3326 }
3327
3328 if (mode_i == 0)
3329 autodetected_vector_mode = bb_vinfo->vector_mode;
3330
3331 if (!fatal)
3332 while (mode_i < vector_modes.length ()
3333 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3334 {
3335 if (dump_enabled_p ())
3336 dump_printf_loc (MSG_NOTE, vect_location,
3337 "***** The result for vector mode %s would"
3338 " be the same\n",
3339 GET_MODE_NAME (vector_modes[mode_i]));
3340 mode_i += 1;
3341 }
3342
3343 delete bb_vinfo;
3344
3345 if (mode_i < vector_modes.length ()
3346 && VECTOR_MODE_P (autodetected_vector_mode)
3347 && (related_vector_mode (vector_modes[mode_i],
3348 GET_MODE_INNER (autodetected_vector_mode))
3349 == autodetected_vector_mode)
3350 && (related_vector_mode (autodetected_vector_mode,
3351 GET_MODE_INNER (vector_modes[mode_i]))
3352 == vector_modes[mode_i]))
3353 {
3354 if (dump_enabled_p ())
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "***** Skipping vector mode %s, which would"
3357 " repeat the analysis for %s\n",
3358 GET_MODE_NAME (vector_modes[mode_i]),
3359 GET_MODE_NAME (autodetected_vector_mode));
3360 mode_i += 1;
3361 }
3362
3363 if (vectorized
3364 || mode_i == vector_modes.length ()
3365 || autodetected_vector_mode == VOIDmode
3366 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3367 vector sizes will fail do not bother iterating. */
3368 || fatal)
3369 return vectorized;
3370
3371 /* Try the next biggest vector size. */
3372 next_vector_mode = vector_modes[mode_i++];
3373 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE, vect_location,
3375 "***** Re-trying analysis with vector mode %s\n",
3376 GET_MODE_NAME (next_vector_mode));
3377 }
3378 }
3379
3380 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3381 true if anything in the basic-block was vectorized. */
3382
3383 bool
3384 vect_slp_bb (basic_block bb)
3385 {
3386 gimple_stmt_iterator gsi;
3387 bool any_vectorized = false;
3388
3389 gsi = gsi_after_labels (bb);
3390 while (!gsi_end_p (gsi))
3391 {
3392 gimple_stmt_iterator region_begin = gsi;
3393 vec<data_reference_p> datarefs = vNULL;
3394 int insns = 0;
3395
3396 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3397 {
3398 gimple *stmt = gsi_stmt (gsi);
3399 if (is_gimple_debug (stmt))
3400 {
3401 /* Skip leading debug stmts. */
3402 if (gsi_stmt (region_begin) == stmt)
3403 gsi_next (&region_begin);
3404 continue;
3405 }
3406 insns++;
3407
3408 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3409 vect_location = stmt;
3410
3411 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3412 break;
3413 }
3414 if (gsi_end_p (region_begin))
3415 break;
3416
3417 /* Skip leading unhandled stmts. */
3418 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3419 {
3420 gsi_next (&gsi);
3421 continue;
3422 }
3423
3424 gimple_stmt_iterator region_end = gsi;
3425
3426 if (insns > param_slp_max_insns_in_bb)
3427 {
3428 if (dump_enabled_p ())
3429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3430 "not vectorized: too many instructions in "
3431 "basic block.\n");
3432 }
3433 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3434 any_vectorized = true;
3435
3436 if (gsi_end_p (region_end))
3437 break;
3438
3439 /* Skip the unhandled stmt. */
3440 gsi_next (&gsi);
3441 }
3442
3443 return any_vectorized;
3444 }
3445
3446
3447 /* Build a variable-length vector in which the elements in ELTS are repeated
3448 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3449 RESULTS and add any new instructions to SEQ.
3450
3451 The approach we use is:
3452
3453 (1) Find a vector mode VM with integer elements of mode IM.
3454
3455 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3456 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3457 from small vectors to IM.
3458
3459 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3460
3461 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3462 correct byte contents.
3463
3464 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3465
3466 We try to find the largest IM for which this sequence works, in order
3467 to cut down on the number of interleaves. */
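/* As a rough sketch, assume NELTS = 4 with ELTS = {a, b, c, d} and
   NVECTORS = 2: step (2) view-converts {a, b} and {c, d} to two IM
   values, step (3) splats each into a vector of mode VM, and one
   interleaving VEC_PERM_EXPR in step (4) then produces the byte
   pattern of {a, b, c, d, a, b, c, d, ...}.  */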
3468
3469 void
3470 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3471 vec<tree> elts, unsigned int nresults,
3472 vec<tree> &results)
3473 {
3474 unsigned int nelts = elts.length ();
3475 tree element_type = TREE_TYPE (vector_type);
3476
3477 /* (1) Find a vector mode VM with integer elements of mode IM. */
3478 unsigned int nvectors = 1;
3479 tree new_vector_type;
3480 tree permutes[2];
3481 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3482 &nvectors, &new_vector_type,
3483 permutes))
3484 gcc_unreachable ();
3485
3486 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3487 unsigned int partial_nelts = nelts / nvectors;
3488 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3489
3490 tree_vector_builder partial_elts;
3491 auto_vec<tree, 32> pieces (nvectors * 2);
3492 pieces.quick_grow (nvectors * 2);
3493 for (unsigned int i = 0; i < nvectors; ++i)
3494 {
3495 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3496 ELTS' has mode IM. */
3497 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3498 for (unsigned int j = 0; j < partial_nelts; ++j)
3499 partial_elts.quick_push (elts[i * partial_nelts + j]);
3500 tree t = gimple_build_vector (seq, &partial_elts);
3501 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3502 TREE_TYPE (new_vector_type), t);
3503
3504 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3505 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3506 }
3507
3508 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3509 correct byte contents.
3510
3511 We need to repeat the following operation log2(nvectors) times:
3512
3513 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3514 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3515
3516 However, if each input repeats every N elements and the VF is
3517 a multiple of N * 2, the HI result is the same as the LO. */
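/* Illustratively, with NVECTORS == 4 the loop below runs
   log2(4) == 2 times; the first pass computes

     out[0] = VEC_PERM_EXPR (in[0], in[2], lo_permute);
     out[1] = VEC_PERM_EXPR (in[0], in[2], hi_permute);
     out[2] = VEC_PERM_EXPR (in[1], in[3], lo_permute);
     out[3] = VEC_PERM_EXPR (in[1], in[3], hi_permute);

   and the second pass repeats the operation on those outputs after the
   IN and OUT halves of PIECES have been swapped.  */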
3518 unsigned int in_start = 0;
3519 unsigned int out_start = nvectors;
3520 unsigned int hi_start = nvectors / 2;
3521 /* A bound on the number of outputs needed to produce NRESULTS results
3522 in the final iteration. */
3523 unsigned int noutputs_bound = nvectors * nresults;
3524 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3525 {
3526 noutputs_bound /= 2;
3527 unsigned int limit = MIN (noutputs_bound, nvectors);
3528 for (unsigned int i = 0; i < limit; ++i)
3529 {
3530 if ((i & 1) != 0
3531 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3532 2 * in_repeat))
3533 {
3534 pieces[out_start + i] = pieces[out_start + i - 1];
3535 continue;
3536 }
3537
3538 tree output = make_ssa_name (new_vector_type);
3539 tree input1 = pieces[in_start + (i / 2)];
3540 tree input2 = pieces[in_start + (i / 2) + hi_start];
3541 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3542 input1, input2,
3543 permutes[i & 1]);
3544 gimple_seq_add_stmt (seq, stmt);
3545 pieces[out_start + i] = output;
3546 }
3547 std::swap (in_start, out_start);
3548 }
3549
3550 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3551 results.reserve (nresults);
3552 for (unsigned int i = 0; i < nresults; ++i)
3553 if (i < nvectors)
3554 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3555 pieces[in_start + i]));
3556 else
3557 results.quick_push (results[i - nvectors]);
3558 }
3559
3560
3561 /* For constant and loop invariant defs in OP_NODE this function creates
3562 vector defs that will be used in the vectorized stmts and stores them
3563 to SLP_TREE_VEC_DEFS of OP_NODE. */
3564
3565 static void
3566 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3567 {
3568 unsigned HOST_WIDE_INT nunits;
3569 tree vec_cst;
3570 unsigned j, number_of_places_left_in_vector;
3571 tree vector_type;
3572 tree vop;
3573 int group_size = op_node->ops.length ();
3574 unsigned int vec_num, i;
3575 unsigned number_of_copies = 1;
3576 bool constant_p;
3577 gimple_seq ctor_seq = NULL;
3578 auto_vec<tree, 16> permute_results;
3579
3580   /* We rely on SLP_TREE_VECTYPE (op_node) being correctly set here.  */
3581 vector_type = SLP_TREE_VECTYPE (op_node);
3582
3583 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3584 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3585 auto_vec<tree> voprnds (number_of_vectors);
3586
3587 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3588 created vectors. It is greater than 1 if unrolling is performed.
3589
3590 For example, we have two scalar operands, s1 and s2 (e.g., group of
3591 strided accesses of size two), while NUNITS is four (i.e., four scalars
3592 of this type can be packed in a vector). The output vector will contain
3593 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3594 will be 2).
3595
3596 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3597 containing the operands.
3598
3599 For example, NUNITS is four as before, and the group size is 8
3600 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3601 {s5, s6, s7, s8}. */
3602
3603 /* When using duplicate_and_interleave, we just need one element for
3604 each scalar statement. */
3605 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3606 nunits = group_size;
3607
3608 number_of_copies = nunits * number_of_vectors / group_size;
3609
3610 number_of_places_left_in_vector = nunits;
3611 constant_p = true;
3612 tree_vector_builder elts (vector_type, nunits, 1);
3613 elts.quick_grow (nunits);
3614 stmt_vec_info insert_after = NULL;
3615 for (j = 0; j < number_of_copies; j++)
3616 {
3617 tree op;
3618 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3619 {
3620 /* Create 'vect_ = {op0,op1,...,opn}'. */
3621 number_of_places_left_in_vector--;
3622 tree orig_op = op;
3623 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3624 {
3625 if (CONSTANT_CLASS_P (op))
3626 {
3627 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3628 {
3629 /* Can't use VIEW_CONVERT_EXPR for booleans because
3630 of possibly different sizes of scalar value and
3631 vector element. */
3632 if (integer_zerop (op))
3633 op = build_int_cst (TREE_TYPE (vector_type), 0);
3634 else if (integer_onep (op))
3635 op = build_all_ones_cst (TREE_TYPE (vector_type));
3636 else
3637 gcc_unreachable ();
3638 }
3639 else
3640 op = fold_unary (VIEW_CONVERT_EXPR,
3641 TREE_TYPE (vector_type), op);
3642 gcc_assert (op && CONSTANT_CLASS_P (op));
3643 }
3644 else
3645 {
3646 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3647 gimple *init_stmt;
3648 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3649 {
3650 tree true_val
3651 = build_all_ones_cst (TREE_TYPE (vector_type));
3652 tree false_val
3653 = build_zero_cst (TREE_TYPE (vector_type));
3654 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3655 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3656 op, true_val,
3657 false_val);
3658 }
3659 else
3660 {
3661 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3662 op);
3663 init_stmt
3664 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3665 op);
3666 }
3667 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3668 op = new_temp;
3669 }
3670 }
3671 elts[number_of_places_left_in_vector] = op;
3672 if (!CONSTANT_CLASS_P (op))
3673 constant_p = false;
3674 /* For BB vectorization we have to compute an insert location
3675 when a def is inside the analyzed region since we cannot
3676 simply insert at the BB start in this case. */
3677 stmt_vec_info opdef;
3678 if (TREE_CODE (orig_op) == SSA_NAME
3679 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3680 && is_a <bb_vec_info> (vinfo)
3681 && (opdef = vinfo->lookup_def (orig_op)))
3682 {
3683 if (!insert_after)
3684 insert_after = opdef;
3685 else
3686 insert_after = get_later_stmt (insert_after, opdef);
3687 }
3688
3689 if (number_of_places_left_in_vector == 0)
3690 {
3691 if (constant_p
3692 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3693 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3694 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3695 else
3696 {
3697 if (permute_results.is_empty ())
3698 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3699 elts, number_of_vectors,
3700 permute_results);
3701 vec_cst = permute_results[number_of_vectors - j - 1];
3702 }
3703 tree init;
3704 if (insert_after)
3705 {
3706 gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
3707 /* vect_init_vector inserts before. */
3708 gsi_next (&gsi);
3709 init = vect_init_vector (vinfo, NULL, vec_cst,
3710 vector_type, &gsi);
3711 }
3712 else
3713 init = vect_init_vector (vinfo, NULL, vec_cst,
3714 vector_type, NULL);
3715 if (ctor_seq != NULL)
3716 {
3717 gimple_stmt_iterator gsi
3718 = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3719 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3720 ctor_seq = NULL;
3721 }
3722 voprnds.quick_push (init);
3723 insert_after = NULL;
3724 number_of_places_left_in_vector = nunits;
3725 constant_p = true;
3726 elts.new_vector (vector_type, nunits, 1);
3727 elts.quick_grow (nunits);
3728 }
3729 }
3730 }
3731
3732   /* Since the vectors are created in reverse order, put them back in
3733      the correct order here.  */
3734 vec_num = voprnds.length ();
3735 for (j = vec_num; j != 0; j--)
3736 {
3737 vop = voprnds[j - 1];
3738 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3739 }
3740
3741   /* If the VF is greater than the unrolling factor needed for the SLP
3742      group of stmts, the number of vectors to be created is greater than
3743      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3744      to replicate the vectors.  */
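  /* E.g. (illustrative): if two defs V0 and V1 were created but
     NUMBER_OF_VECTORS is four, the loop below extends the defs
     to { V0, V1, V0, V1 }.  */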
3745 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
3746 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
3747 i++)
3748 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3749 }
3750
3751 /* Get the Ith vectorized definition from SLP_NODE. */
3752
3753 tree
3754 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
3755 {
3756 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
3757 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
3758 else
3759 return SLP_TREE_VEC_DEFS (slp_node)[i];
3760 }
3761
3762 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
3763
3764 void
3765 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
3766 {
3767 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
3768 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
3769 {
3770 unsigned j;
3771 gimple *vec_def_stmt;
3772 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
3773 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
3774 }
3775 else
3776 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
3777 }
3778
3779 /* Get N vectorized definitions for SLP_NODE. */
3780
3781 void
3782 vect_get_slp_defs (vec_info *,
3783 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3784 {
3785 if (n == -1U)
3786 n = SLP_TREE_CHILDREN (slp_node).length ();
3787
3788 for (unsigned i = 0; i < n; ++i)
3789 {
3790 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3791 vec<tree> vec_defs = vNULL;
3792 vect_get_slp_defs (child, &vec_defs);
3793 vec_oprnds->quick_push (vec_defs);
3794 }
3795 }
3796
3797 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3798 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3799 permute statements for the SLP node NODE. */
3800
3801 bool
3802 vect_transform_slp_perm_load (vec_info *vinfo,
3803 slp_tree node, vec<tree> dr_chain,
3804 gimple_stmt_iterator *gsi, poly_uint64 vf,
3805 bool analyze_only, unsigned *n_perms)
3806 {
3807 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3808 int vec_index = 0;
3809 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3810 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
3811 unsigned int mask_element;
3812 machine_mode mode;
3813
3814 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3815 return false;
3816
3817 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3818
3819 mode = TYPE_MODE (vectype);
3820 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3821
3822 /* Initialize the vect stmts of NODE to properly insert the generated
3823 stmts later. */
3824 if (! analyze_only)
3825 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3826 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3827 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3828
3829 /* Generate permutation masks for every NODE. Number of masks for each NODE
3830 is equal to GROUP_SIZE.
3831 E.g., we have a group of three nodes with three loads from the same
3832    location in each node, and the vector size is 4.  I.e., we have an
3833 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3834 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3835 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3836 ...
3837
3838 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3839    The last mask is illegal since we assume two operands for the permute
3840 operation, and the mask element values can't be outside that range.
3841 Hence, the last mask must be converted into {2,5,5,5}.
3842 For the first two permutations we need the first and the second input
3843 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3844 we need the second and the third vectors: {b1,c1,a2,b2} and
3845 {c2,a3,b3,c3}. */
3846
3847 int vect_stmts_counter = 0;
3848 unsigned int index = 0;
3849 int first_vec_index = -1;
3850 int second_vec_index = -1;
3851 bool noop_p = true;
3852 *n_perms = 0;
3853
3854 vec_perm_builder mask;
3855 unsigned int nelts_to_build;
3856 unsigned int nvectors_per_build;
3857 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3858 && multiple_p (nunits, group_size));
3859 if (repeating_p)
3860 {
3861 /* A single vector contains a whole number of copies of the node, so:
3862 (a) all permutes can use the same mask; and
3863 (b) the permutes only need a single vector input. */
3864 mask.new_vector (nunits, group_size, 3);
3865 nelts_to_build = mask.encoded_nelts ();
3866 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3867 }
3868 else
3869 {
3870 /* We need to construct a separate mask for each vector statement. */
3871 unsigned HOST_WIDE_INT const_nunits, const_vf;
3872 if (!nunits.is_constant (&const_nunits)
3873 || !vf.is_constant (&const_vf))
3874 return false;
3875 mask.new_vector (const_nunits, const_nunits, 1);
3876 nelts_to_build = const_vf * group_size;
3877 nvectors_per_build = 1;
3878 }
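  /* As an illustrative example: for GROUP_SIZE == 2, a load permutation
     { 1, 0 } and NUNITS == 8, REPEATING_P holds and the single encoded
     mask built below is { 1, 0, 3, 2, 5, 4, ... }, i.e. a swap of each
     adjacent pair of lanes that all vector statements can share.  */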
3879
3880 unsigned int count = mask.encoded_nelts ();
3881 mask.quick_grow (count);
3882 vec_perm_indices indices;
3883
3884 for (unsigned int j = 0; j < nelts_to_build; j++)
3885 {
3886 unsigned int iter_num = j / group_size;
3887 unsigned int stmt_num = j % group_size;
3888 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3889 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3890 if (repeating_p)
3891 {
3892 first_vec_index = 0;
3893 mask_element = i;
3894 }
3895 else
3896 {
3897 /* Enforced before the loop when !repeating_p. */
3898 unsigned int const_nunits = nunits.to_constant ();
3899 vec_index = i / const_nunits;
3900 mask_element = i % const_nunits;
3901 if (vec_index == first_vec_index
3902 || first_vec_index == -1)
3903 {
3904 first_vec_index = vec_index;
3905 }
3906 else if (vec_index == second_vec_index
3907 || second_vec_index == -1)
3908 {
3909 second_vec_index = vec_index;
3910 mask_element += const_nunits;
3911 }
3912 else
3913 {
3914 if (dump_enabled_p ())
3915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3916 "permutation requires at "
3917 "least three vectors %G",
3918 stmt_info->stmt);
3919 gcc_assert (analyze_only);
3920 return false;
3921 }
3922
3923 gcc_assert (mask_element < 2 * const_nunits);
3924 }
3925
3926 if (mask_element != index)
3927 noop_p = false;
3928 mask[index++] = mask_element;
3929
3930 if (index == count && !noop_p)
3931 {
3932 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3933 if (!can_vec_perm_const_p (mode, indices))
3934 {
3935 if (dump_enabled_p ())
3936 {
3937 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3938 vect_location,
3939 "unsupported vect permute { ");
3940 for (i = 0; i < count; ++i)
3941 {
3942 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3943 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3944 }
3945 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3946 }
3947 gcc_assert (analyze_only);
3948 return false;
3949 }
3950
3951 ++*n_perms;
3952 }
3953
3954 if (index == count)
3955 {
3956 if (!analyze_only)
3957 {
3958 tree mask_vec = NULL_TREE;
3959
3960 if (! noop_p)
3961 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3962
3963 if (second_vec_index == -1)
3964 second_vec_index = first_vec_index;
3965
3966 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3967 {
3968 /* Generate the permute statement if necessary. */
3969 tree first_vec = dr_chain[first_vec_index + ri];
3970 tree second_vec = dr_chain[second_vec_index + ri];
3971 gimple *perm_stmt;
3972 if (! noop_p)
3973 {
3974 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3975 tree perm_dest
3976 = vect_create_destination_var (gimple_assign_lhs (stmt),
3977 vectype);
3978 perm_dest = make_ssa_name (perm_dest);
3979 perm_stmt
3980 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3981 first_vec, second_vec,
3982 mask_vec);
3983 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
3984 gsi);
3985 }
3986 else
3987 /* If mask was NULL_TREE generate the requested
3988 identity transform. */
3989 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
3990
3991 /* Store the vector statement in NODE. */
3992 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
3993 }
3994 }
3995
3996 index = 0;
3997 first_vec_index = -1;
3998 second_vec_index = -1;
3999 noop_p = true;
4000 }
4001 }
4002
4003 return true;
4004 }
4005
4006
4007 /* Vectorize the SLP permutations in NODE as specified
4008 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4009 child number and lane number.
4010 Interleaving of two two-lane two-child SLP subtrees (not supported):
4011 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4012 A blend of two four-lane two-child SLP subtrees:
4013 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4014 Highpart of a four-lane one-child SLP subtree (not supported):
4015 [ { 0, 2 }, { 0, 3 } ]
4016    Currently only a subset of these is supported by the code generation
4017    below.  */
4017
4018 static bool
4019 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4020 slp_tree node, stmt_vector_for_cost *cost_vec)
4021 {
4022 tree vectype = SLP_TREE_VECTYPE (node);
4023
4024 /* ??? We currently only support all same vector input and output types
4025 while the SLP IL should really do a concat + select and thus accept
4026 arbitrary mismatches. */
4027 slp_tree child;
4028 unsigned i;
4029 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4030 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4031 {
4032 if (dump_enabled_p ())
4033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4034 "Unsupported lane permutation\n");
4035 return false;
4036 }
4037
4038 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4039 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4040 if (dump_enabled_p ())
4041 {
4042 dump_printf_loc (MSG_NOTE, vect_location,
4043 "vectorizing permutation");
4044 for (unsigned i = 0; i < perm.length (); ++i)
4045 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4046 dump_printf (MSG_NOTE, "\n");
4047 }
4048
4049 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4050 if (!nunits.is_constant ())
4051 return false;
4052 unsigned HOST_WIDE_INT vf = 1;
4053 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4054 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4055 return false;
4056 unsigned olanes = vf * SLP_TREE_LANES (node);
4057 gcc_assert (multiple_p (olanes, nunits));
4058
4059   /* Compute the { { SLP operand, vector index }, lane } permutation sequence
4060      from the { SLP operand, scalar lane } permutation as recorded in the
4061      SLP node as an intermediate step.  This part should already work
4062      with SLP children with an arbitrary number of lanes.  */
4063 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4064 auto_vec<unsigned> active_lane;
4065 vperm.create (olanes);
4066 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
4067 for (unsigned i = 0; i < vf; ++i)
4068 {
4069 for (unsigned pi = 0; pi < perm.length (); ++pi)
4070 {
4071 std::pair<unsigned, unsigned> p = perm[pi];
4072 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4073 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4074 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4075 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4076 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4077 }
4078 /* Advance to the next group. */
4079 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4080 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4081 }
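  /* E.g. (illustrative): the blend from the comment above,
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ], with V4SI children and
     VF == 1 yields the sequence
     { { 0, 0 }, 0 } { { 1, 0 }, 1 } { { 0, 0 }, 2 } { { 1, 0 }, 3 }
     which the code below turns into the two-input permute mask
     { 0, 5, 2, 7 }.  */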
4082
4083 if (dump_enabled_p ())
4084 {
4085 dump_printf_loc (MSG_NOTE, vect_location, "as");
4086 for (unsigned i = 0; i < vperm.length (); ++i)
4087 {
4088 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4089 dump_printf (MSG_NOTE, ",");
4090 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4091 vperm[i].first.first, vperm[i].first.second,
4092 			 vperm[i].second);
4093 }
4094 dump_printf (MSG_NOTE, "\n");
4095 }
4096
4097   /* We can only handle two-vector permutes; everything else should
4098 be lowered on the SLP level. The following is closely inspired
4099 by vect_transform_slp_perm_load and is supposed to eventually
4100 replace it.
4101 ??? As intermediate step do code-gen in the SLP tree representation
4102 somehow? */
4103 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4104 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4105 unsigned int const_nunits = nunits.to_constant ();
4106 unsigned int index = 0;
4107 unsigned int mask_element;
4108 vec_perm_builder mask;
4109 mask.new_vector (const_nunits, const_nunits, 1);
4110 unsigned int count = mask.encoded_nelts ();
4111 mask.quick_grow (count);
4112 vec_perm_indices indices;
4113 unsigned nperms = 0;
4114 for (unsigned i = 0; i < vperm.length (); ++i)
4115 {
4116 mask_element = vperm[i].second;
4117 if (first_vec.first == -1U
4118 || first_vec == vperm[i].first)
4119 first_vec = vperm[i].first;
4120 else if (second_vec.first == -1U
4121 || second_vec == vperm[i].first)
4122 {
4123 second_vec = vperm[i].first;
4124 mask_element += const_nunits;
4125 }
4126 else
4127 {
4128 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4130 "permutation requires at "
4131 "least three vectors");
4132 gcc_assert (!gsi);
4133 return false;
4134 }
4135
4136 mask[index++] = mask_element;
4137
4138 if (index == count)
4139 {
4140 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4141 const_nunits);
4142 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4143 {
4144 if (dump_enabled_p ())
4145 {
4146 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4147 vect_location,
4148 "unsupported vect permute { ");
4149 for (i = 0; i < count; ++i)
4150 {
4151 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4152 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4153 }
4154 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4155 }
4156 gcc_assert (!gsi);
4157 return false;
4158 }
4159
4160 nperms++;
4161 if (gsi)
4162 {
4163 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4164
4165 if (second_vec.first == -1U)
4166 second_vec = first_vec;
4167
4168 /* Generate the permute statement if necessary. */
4169 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4170 tree first_def
4171 = vect_get_slp_vect_def (first_node, first_vec.second);
4172 slp_tree second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
4173 tree second_def
4174 = vect_get_slp_vect_def (second_node, second_vec.second);
4175 tree perm_dest = make_ssa_name (vectype);
4176 gassign *perm_stmt
4177 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4178 first_def, second_def,
4179 mask_vec);
4180 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4181 /* Store the vector statement in NODE. */
4182 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4183 }
4184
4185 index = 0;
4186 first_vec = std::make_pair (-1U, -1U);
4187 second_vec = std::make_pair (-1U, -1U);
4188 }
4189 }
4190
4191 if (!gsi)
4192 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4193
4194 return true;
4195 }
4196
4197 /* Vectorize SLP instance tree in postorder. */
4198
4199 static void
4200 vect_schedule_slp_instance (vec_info *vinfo,
4201 slp_tree node, slp_instance instance)
4202 {
4203 gimple_stmt_iterator si;
4204 int i;
4205 slp_tree child;
4206
4207 /* See if we have already vectorized the node in the graph of the
4208 SLP instance. */
4209 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4210 && SLP_TREE_VEC_STMTS (node).exists ())
4211 || SLP_TREE_VEC_DEFS (node).exists ())
4212 return;
4213
4214 /* Vectorize externals and constants. */
4215 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4216 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4217 {
4218 /* ??? vectorizable_shift can end up using a scalar operand which is
4219 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4220 node in this case. */
4221 if (!SLP_TREE_VECTYPE (node))
4222 return;
4223
4224 vect_create_constant_vectors (vinfo, node);
4225 return;
4226 }
4227
4228 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4229 vect_schedule_slp_instance (vinfo, child, instance);
4230
4231 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4232 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4233
4234 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4235 if (dump_enabled_p ())
4236 dump_printf_loc (MSG_NOTE, vect_location,
4237 "------>vectorizing SLP node starting from: %G",
4238 stmt_info->stmt);
4239
4240 if (STMT_VINFO_DATA_REF (stmt_info)
4241 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
4242 {
4243       /* Vectorized loads go before the first scalar load to make it
4244 	 ready early; vectorized stores go before the last scalar
4245 	 stmt, which is where all uses are ready.  */
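      /* E.g. (illustrative): for a group of loads from a[0..3] the single
	 vector load is emitted before the scalar load of a[0], while for
	 a group of stores to a[0..3] it is emitted before the scalar
	 store to a[3].  */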
4246 stmt_vec_info last_stmt_info = NULL;
4247 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
4248 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
4249 else /* DR_IS_WRITE */
4250 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4251 si = gsi_for_stmt (last_stmt_info->stmt);
4252 }
4253 else if (SLP_TREE_CHILDREN (node).is_empty ())
4254 {
4255 /* This happens for reduction and induction PHIs where we do not use the
4256 insertion iterator. */
4257 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4258 == cycle_phi_info_type
4259 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4260 == induc_vec_info_type));
4261 si = gsi_none ();
4262 }
4263 else
4264 {
4265       /* Emit other stmts after the children's vectorized defs, which is
4266 	 the earliest position possible.  */
4267 gimple *last_stmt = NULL;
4268 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4269 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4270 {
4271 /* For fold-left reductions we are retaining the scalar
4272 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
4273 set so the representation isn't perfect. Resort to the
4274 last scalar def here. */
4275 if (SLP_TREE_VEC_STMTS (child).is_empty ())
4276 {
4277 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
4278 == cycle_phi_info_type);
4279 gphi *phi = as_a <gphi *>
4280 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
4281 if (!last_stmt
4282 || vect_stmt_dominates_stmt_p (last_stmt, phi))
4283 last_stmt = phi;
4284 }
4285 	    /* We are emitting all vectorized stmts in the same place, so
4286 	       the last one in SLP_TREE_VEC_STMTS is also the last in the IL.
4287 	       ??? Unless a load permutation is applied and that happens
4288 	       to re-use an earlier generated load.  */
4289 unsigned j;
4290 gimple *vstmt;
4291 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
4292 if (!last_stmt
4293 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4294 last_stmt = vstmt;
4295 }
4296 else if (!SLP_TREE_VECTYPE (child))
4297 {
4298 	    /* For externals without a vectype look at all the scalar defs.  */
4299 unsigned j;
4300 tree def;
4301 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
4302 if (TREE_CODE (def) == SSA_NAME
4303 && !SSA_NAME_IS_DEFAULT_DEF (def))
4304 {
4305 gimple *stmt = SSA_NAME_DEF_STMT (def);
4306 if (!last_stmt
4307 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
4308 last_stmt = stmt;
4309 }
4310 }
4311 else
4312 {
4313 /* For externals we have to look at all defs since their
4314 insertion place is decided per vector. */
4315 unsigned j;
4316 tree vdef;
4317 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
4318 if (TREE_CODE (vdef) == SSA_NAME)
4319 {
4320 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
4321 if (!last_stmt
4322 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4323 last_stmt = vstmt;
4324 }
4325 }
4326 if (is_a <gphi *> (last_stmt))
4327 si = gsi_after_labels (gimple_bb (last_stmt));
4328 else
4329 {
4330 si = gsi_for_stmt (last_stmt);
4331 gsi_next (&si);
4332 }
4333 }
4334
4335 bool done_p = false;
4336
4337 /* Handle purely internal nodes. */
4338 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4339 {
4340       /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
4341 	 be shared with different SLP nodes (but usually it's the same
4342 	 operation apart from the case where the stmt only denotes the
4343 	 actual scalar lane defs ...).  So do not call vect_transform_stmt
4344 	 but open-code it here (partly).  */
4345 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
4346 gcc_assert (done);
4347 done_p = true;
4348 }
4349 if (!done_p)
4350 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4351 }
4352
4353 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4354 For loop vectorization this is done in vectorizable_call, but for SLP
4355 it needs to be deferred until end of vect_schedule_slp, because multiple
4356 SLP instances may refer to the same scalar stmt. */
4357
4358 static void
4359 vect_remove_slp_scalar_calls (vec_info *vinfo,
4360 slp_tree node, hash_set<slp_tree> &visited)
4361 {
4362 gimple *new_stmt;
4363 gimple_stmt_iterator gsi;
4364 int i;
4365 slp_tree child;
4366 tree lhs;
4367 stmt_vec_info stmt_info;
4368
4369 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4370 return;
4371
4372 if (visited.add (node))
4373 return;
4374
4375 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4376 vect_remove_slp_scalar_calls (vinfo, child, visited);
4377
4378 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4379 {
4380 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4381 if (!stmt || gimple_bb (stmt) == NULL)
4382 continue;
4383 if (is_pattern_stmt_p (stmt_info)
4384 || !PURE_SLP_STMT (stmt_info))
4385 continue;
4386 lhs = gimple_call_lhs (stmt);
4387 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4388 gsi = gsi_for_stmt (stmt);
4389 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4390 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4391 }
4392 }
4393
4394 static void
4395 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4396 {
4397 hash_set<slp_tree> visited;
4398 vect_remove_slp_scalar_calls (vinfo, node, visited);
4399 }
4400
4401 /* Vectorize the instance root. */
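/* E.g. (illustrative): for a root stmt x = {a, b, c, d} whose defs were
   vectorized into a single vector statement, the CONSTRUCTOR is replaced
   by an assignment from that statement's LHS (via a VIEW_CONVERT_EXPR if
   the types differ); with multiple vector statements a new CONSTRUCTOR
   of their LHSs is built instead.  */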
4402
4403 void
4404 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4405 {
4406 gassign *rstmt = NULL;
4407
4408 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4409 {
4410 gimple *child_stmt;
4411 int j;
4412
4413 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4414 {
4415 tree vect_lhs = gimple_get_lhs (child_stmt);
4416 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4417 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4418 TREE_TYPE (vect_lhs)))
4419 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4420 vect_lhs);
4421 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4422 break;
4423 }
4424 }
4425 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4426 {
4427 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4428 gimple *child_stmt;
4429 int j;
4430 vec<constructor_elt, va_gc> *v;
4431 vec_alloc (v, nelts);
4432
4433 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4434 {
4435 CONSTRUCTOR_APPEND_ELT (v,
4436 NULL_TREE,
4437 gimple_get_lhs (child_stmt));
4438 }
4439 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4440 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4441 tree r_constructor = build_constructor (rtype, v);
4442 rstmt = gimple_build_assign (lhs, r_constructor);
4443 }
4444
4445 gcc_assert (rstmt);
4446
4447 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4448 gsi_replace (&rgsi, rstmt, true);
4449 }
4450
4451 /* Generate vector code for all SLP instances in the loop/basic block. */
4452
4453 void
4454 vect_schedule_slp (vec_info *vinfo)
4455 {
4456 vec<slp_instance> slp_instances;
4457 slp_instance instance;
4458 unsigned int i;
4459
4460 slp_instances = vinfo->slp_instances;
4461 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4462 {
4463 slp_tree node = SLP_INSTANCE_TREE (instance);
4464 /* Schedule the tree of INSTANCE. */
4465 vect_schedule_slp_instance (vinfo, node, instance);
4466
4467 if (SLP_INSTANCE_ROOT_STMT (instance))
4468 vectorize_slp_instance_root_stmt (node, instance);
4469
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "vectorizing stmts using SLP.\n");
4473 }
4474
4475 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4476 {
4477 slp_tree root = SLP_INSTANCE_TREE (instance);
4478 stmt_vec_info store_info;
4479 unsigned int j;
4480
4481 /* Remove scalar call stmts. Do not do this for basic-block
4482 vectorization as not all uses may be vectorized.
4483 ??? Why should this be necessary? DCE should be able to
4484 remove the stmts itself.
4485 ??? For BB vectorization we can as well remove scalar
4486 stmts starting from the SLP tree root if they have no
4487 uses. */
4488 if (is_a <loop_vec_info> (vinfo))
4489 vect_remove_slp_scalar_calls (vinfo, root);
4490
4491 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4492 {
4493 if (!STMT_VINFO_DATA_REF (store_info))
4494 break;
4495
4496 if (SLP_INSTANCE_ROOT_STMT (instance))
4497 continue;
4498
4499 store_info = vect_orig_stmt (store_info);
4500 /* Free the attached stmt_vec_info and remove the stmt. */
4501 vinfo->remove_stmt (store_info);
4502 }
4503 }
4504 }