refactor SLP constant insertion and provide entry insert helper
[gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48
49
50 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
51 slp_tree, stmt_vector_for_cost *);
52
53 /* Initialize an SLP node. */
54
55 _slp_tree::_slp_tree ()
56 {
57 SLP_TREE_SCALAR_STMTS (this) = vNULL;
58 SLP_TREE_SCALAR_OPS (this) = vNULL;
59 SLP_TREE_VEC_STMTS (this) = vNULL;
60 SLP_TREE_VEC_DEFS (this) = vNULL;
61 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
62 SLP_TREE_CHILDREN (this) = vNULL;
63 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
64 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
65 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
66 SLP_TREE_CODE (this) = ERROR_MARK;
67 SLP_TREE_VECTYPE (this) = NULL_TREE;
68 SLP_TREE_REPRESENTATIVE (this) = NULL;
69 this->refcnt = 1;
70 this->max_nunits = 1;
71 this->lanes = 0;
72 }
73
74 /* Tear down an SLP node. */
75
76 _slp_tree::~_slp_tree ()
77 {
78 SLP_TREE_CHILDREN (this).release ();
79 SLP_TREE_SCALAR_STMTS (this).release ();
80 SLP_TREE_SCALAR_OPS (this).release ();
81 SLP_TREE_VEC_STMTS (this).release ();
82 SLP_TREE_VEC_DEFS (this).release ();
83 SLP_TREE_LOAD_PERMUTATION (this).release ();
84 SLP_TREE_LANE_PERMUTATION (this).release ();
85 }
86
87 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
88 FINAL_P is true if we have vectorized the instance or if we have
89 made a final decision not to vectorize the statements in any way. */
90
91 static void
92 vect_free_slp_tree (slp_tree node, bool final_p)
93 {
94 int i;
95 slp_tree child;
96
97 if (--node->refcnt != 0)
98 return;
99
100 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
101 vect_free_slp_tree (child, final_p);
102
103 /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
104 Some statements might no longer exist, after having been
105 removed by vect_transform_stmt. Updating the remaining
106 statements would be redundant. */
107 if (!final_p)
108 {
109 stmt_vec_info stmt_info;
110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
111 {
112 gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
113 STMT_VINFO_NUM_SLP_USES (stmt_info)--;
114 }
115 }
116
117 delete node;
118 }
119
120 /* Free the memory allocated for the SLP instance. FINAL_P is true if we
121 have vectorized the instance or if we have made a final decision not
122 to vectorize the statements in any way. */
123
124 void
125 vect_free_slp_instance (slp_instance instance, bool final_p)
126 {
127 vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
128 SLP_INSTANCE_LOADS (instance).release ();
129 free (instance);
130 }
131
132
133 /* Create an SLP node for SCALAR_STMTS. */
134
135 static slp_tree
136 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
137 {
138 slp_tree node = new _slp_tree;
139 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
140 SLP_TREE_CHILDREN (node).create (nops);
141 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
142 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
143 SLP_TREE_LANES (node) = scalar_stmts.length ();
144
145 unsigned i;
146 stmt_vec_info stmt_info;
147 FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
148 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
149
150 return node;
151 }
152
153 /* Create an SLP node for OPS. */
154
155 static slp_tree
156 vect_create_new_slp_node (vec<tree> ops)
157 {
158 slp_tree node = new _slp_tree;
159 SLP_TREE_SCALAR_OPS (node) = ops;
160 SLP_TREE_DEF_TYPE (node) = vect_external_def;
161 SLP_TREE_LANES (node) = ops.length ();
162 return node;
163 }
164
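/* Illustrative sketch (not part of the original file): wrapping two
   invariant operands into an external-def leaf node.  OP0 and OP1 are
   hypothetical trees already computed by the caller:

     vec<tree> ops = vNULL;
     ops.safe_push (op0);
     ops.safe_push (op1);
     slp_tree leaf = vect_create_new_slp_node (ops);

   afterwards SLP_TREE_DEF_TYPE (leaf) is vect_external_def and
   SLP_TREE_LANES (leaf) is 2.  */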
165
166 /* This structure is used in creation of an SLP tree. Each instance
167 corresponds to the same operand in a group of scalar stmts in an SLP
168 node. */
169 typedef struct _slp_oprnd_info
170 {
171 /* Def-stmts for the operands. */
172 vec<stmt_vec_info> def_stmts;
173 /* Operands. */
174 vec<tree> ops;
175 /* Information about the first statement, its vector def-type, type, the
176 operand itself in case it's constant, and an indication if it's a pattern
177 stmt. */
178 tree first_op_type;
179 enum vect_def_type first_dt;
180 bool any_pattern;
181 } *slp_oprnd_info;
182
183
184 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
185 operand. */
186 static vec<slp_oprnd_info>
187 vect_create_oprnd_info (int nops, int group_size)
188 {
189 int i;
190 slp_oprnd_info oprnd_info;
191 vec<slp_oprnd_info> oprnds_info;
192
193 oprnds_info.create (nops);
194 for (i = 0; i < nops; i++)
195 {
196 oprnd_info = XNEW (struct _slp_oprnd_info);
197 oprnd_info->def_stmts.create (group_size);
198 oprnd_info->ops.create (group_size);
199 oprnd_info->first_dt = vect_uninitialized_def;
200 oprnd_info->first_op_type = NULL_TREE;
201 oprnd_info->any_pattern = false;
202 oprnds_info.quick_push (oprnd_info);
203 }
204
205 return oprnds_info;
206 }
207
208
209 /* Free operands info. */
210
211 static void
212 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216
217 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
218 {
219 oprnd_info->def_stmts.release ();
220 oprnd_info->ops.release ();
221 XDELETE (oprnd_info);
222 }
223
224 oprnds_info.release ();
225 }
226
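/* Usage sketch, mirroring the caller in vect_build_slp_tree_2 below:
   operand info is allocated per candidate SLP group, filled lane by
   lane and always released again whether or not the build succeeds:

     vec<slp_oprnd_info> oprnds_info
       = vect_create_oprnd_info (nops, group_size);
     ... vect_get_and_check_slp_defs () fills def_stmts and ops ...
     vect_free_oprnd_info (oprnds_info);  */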
227
228 /* Return true if STMTS contains a pattern statement. */
229
230 static bool
231 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
232 {
233 stmt_vec_info stmt_info;
234 unsigned int i;
235 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
236 if (is_pattern_stmt_p (stmt_info))
237 return true;
238 return false;
239 }
240
241 /* Return true when all lanes in the external or constant NODE have
242 the same value. */
243
244 static bool
245 vect_slp_tree_uniform_p (slp_tree node)
246 {
247 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
248 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
249
250 /* Pre-existing vectors. */
251 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
252 return false;
253
254 unsigned i;
255 tree op, first = NULL_TREE;
256 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
257 if (!first)
258 first = op;
259 else if (!operand_equal_p (first, op, 0))
260 return false;
261
262 return true;
263 }
264
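/* Example (illustrative): a constant node with scalar ops
   { 3, 3, 3, 3 } is uniform and can become a single splat, while
   { 3, 4, 3, 4 } is not.  Nodes without scalar ops (pre-existing
   vectors) are conservatively reported as non-uniform above.  */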
265 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
266 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
267 of the chain. */
268
269 int
270 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
271 stmt_vec_info first_stmt_info)
272 {
273 stmt_vec_info next_stmt_info = first_stmt_info;
274 int result = 0;
275
276 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
277 return -1;
278
279 do
280 {
281 if (next_stmt_info == stmt_info)
282 return result;
283 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
284 if (next_stmt_info)
285 result += DR_GROUP_GAP (next_stmt_info);
286 }
287 while (next_stmt_info);
288
289 return -1;
290 }
291
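/* Example (illustrative, assuming the usual DR_GROUP_GAP encoding in
   which each non-leading element records its distance in elements from
   the previous one): for the load group a[0], a[1], a[3] the gaps of
   the second and third elements are 1 and 2, so their places in the
   chain are 1 and 3 respectively.  */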
292 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
293 using the method implemented by duplicate_and_interleave. Return true
294 if so, returning the number of intermediate vectors in *NVECTORS_OUT
295 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
296 (if nonnull). */
297
298 bool
299 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
300 tree elt_type, unsigned int *nvectors_out,
301 tree *vector_type_out,
302 tree *permutes)
303 {
304 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
305 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
306 return false;
307
308 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
309 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
310 unsigned int nvectors = 1;
311 for (;;)
312 {
313 scalar_int_mode int_mode;
314 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
315 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
316 {
317 /* Get the natural vector type for this SLP group size. */
318 tree int_type = build_nonstandard_integer_type
319 (GET_MODE_BITSIZE (int_mode), 1);
320 tree vector_type
321 = get_vectype_for_scalar_type (vinfo, int_type, count);
322 if (vector_type
323 && VECTOR_MODE_P (TYPE_MODE (vector_type))
324 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
325 GET_MODE_SIZE (base_vector_mode)))
326 {
327 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
328 together into elements of type INT_TYPE and using the result
329 to build NVECTORS vectors. */
330 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
331 vec_perm_builder sel1 (nelts, 2, 3);
332 vec_perm_builder sel2 (nelts, 2, 3);
333 poly_int64 half_nelts = exact_div (nelts, 2);
334 for (unsigned int i = 0; i < 3; ++i)
335 {
336 sel1.quick_push (i);
337 sel1.quick_push (i + nelts);
338 sel2.quick_push (half_nelts + i);
339 sel2.quick_push (half_nelts + i + nelts);
340 }
341 vec_perm_indices indices1 (sel1, 2, nelts);
342 vec_perm_indices indices2 (sel2, 2, nelts);
343 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
344 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
345 {
346 if (nvectors_out)
347 *nvectors_out = nvectors;
348 if (vector_type_out)
349 *vector_type_out = vector_type;
350 if (permutes)
351 {
352 permutes[0] = vect_gen_perm_mask_checked (vector_type,
353 indices1);
354 permutes[1] = vect_gen_perm_mask_checked (vector_type,
355 indices2);
356 }
357 return true;
358 }
359 }
360 }
361 if (!multiple_p (elt_bytes, 2, &elt_bytes))
362 return false;
363 nvectors *= 2;
364 }
365 }
366
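/* Worked example (illustrative): for COUNT = 4 elements of a 4-byte
   ELT_TYPE the loop above first tries to fuse all four into a single
   16-byte integer; if no such scalar integer mode exists it halves
   ELT_BYTES to 8 and doubles NVECTORS to 2, now fusing pairs of
   elements into 8-byte integers, and so on until both a fused integer
   mode and a matching vector mode are found or ELT_BYTES can no longer
   be halved.  */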
367 /* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in
368 OPRNDS_INFO), check that they are of a valid type and that they match
369 the defs of the first stmt of the SLP group (stored in OPRNDS_INFO).
370 This function tries to match stmts by swapping operands of
371 STMTS[STMT_NUM] when possible. A non-zero *SWAP on entry indicates
372 that a swap is required for cond_expr stmts: *SWAP is 1 if STMT is a
373 cond and the operands of the comparison need to be swapped; *SWAP is
374 2 if STMT is a cond and the comparison code needs to be inverted.
375 If any operands are swapped in this function, *SWAP is set to a
376 non-zero value on return.
377 Return -1 on a fatal error; return 1 if the error could be corrected
378 by swapping operands of this node's parent; return 0 otherwise. */
379 static int
380 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
381 vec<stmt_vec_info> stmts, unsigned stmt_num,
382 vec<slp_oprnd_info> *oprnds_info)
383 {
384 stmt_vec_info stmt_info = stmts[stmt_num];
385 tree oprnd;
386 unsigned int i, number_of_oprnds;
387 enum vect_def_type dt = vect_uninitialized_def;
388 slp_oprnd_info oprnd_info;
389 int first_op_idx = 1;
390 unsigned int commutative_op = -1U;
391 bool first_op_cond = false;
392 bool first = stmt_num == 0;
393
394 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
395 {
396 number_of_oprnds = gimple_call_num_args (stmt);
397 first_op_idx = 3;
398 if (gimple_call_internal_p (stmt))
399 {
400 internal_fn ifn = gimple_call_internal_fn (stmt);
401 commutative_op = first_commutative_argument (ifn);
402
403 /* Masked load, only look at mask. */
404 if (ifn == IFN_MASK_LOAD)
405 {
406 number_of_oprnds = 1;
407 /* Mask operand index. */
408 first_op_idx = 5;
409 }
410 }
411 }
412 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
413 {
414 enum tree_code code = gimple_assign_rhs_code (stmt);
415 number_of_oprnds = gimple_num_ops (stmt) - 1;
416 /* Swapping can only be done for cond_expr if asked to, otherwise we
417 could end up with a different comparison code from the first stmt. */
418 if (code == COND_EXPR
419 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
420 {
421 first_op_cond = true;
422 number_of_oprnds++;
423 }
424 else
425 commutative_op = commutative_tree_code (code) ? 0U : -1U;
426 }
427 else
428 return -1;
429
430 bool swapped = (*swap != 0);
431 gcc_assert (!swapped || first_op_cond);
432 for (i = 0; i < number_of_oprnds; i++)
433 {
434 again:
435 if (first_op_cond)
436 {
437 /* Map indicating how operands of cond_expr should be swapped. */
438 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
439 int *map = maps[*swap];
440
441 if (i < 2)
442 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
443 first_op_idx), map[i]);
444 else
445 oprnd = gimple_op (stmt_info->stmt, map[i]);
446 }
447 else
448 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
449 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
450 oprnd = TREE_OPERAND (oprnd, 0);
451
452 oprnd_info = (*oprnds_info)[i];
453
454 stmt_vec_info def_stmt_info;
455 if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
456 {
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
459 "Build SLP failed: can't analyze def for %T\n",
460 oprnd);
461
462 return -1;
463 }
464
465 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
466 oprnd_info->any_pattern = true;
467
468 if (first)
469 {
470 /* For the swapping logic below force vect_reduction_def
471 for the reduction op in a SLP reduction group. */
472 if (!STMT_VINFO_DATA_REF (stmt_info)
473 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
474 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
475 && def_stmt_info)
476 dt = vect_reduction_def;
477 oprnd_info->first_dt = dt;
478 oprnd_info->first_op_type = TREE_TYPE (oprnd);
479 }
480 else
481 {
482 /* Not the first stmt of the group, check that the def-stmt/s match
483 the def-stmt/s of the first stmt. Allow different definition
484 types for reduction chains: the first stmt must be a
485 vect_reduction_def (a phi node), and the rest
486 must be part of the reduction chain. */
487 tree type = TREE_TYPE (oprnd);
488 if ((oprnd_info->first_dt != dt
489 && !(oprnd_info->first_dt == vect_reduction_def
490 && !STMT_VINFO_DATA_REF (stmt_info)
491 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
492 && def_stmt_info
493 && !STMT_VINFO_DATA_REF (def_stmt_info)
494 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
495 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
496 && !((oprnd_info->first_dt == vect_external_def
497 || oprnd_info->first_dt == vect_constant_def)
498 && (dt == vect_external_def
499 || dt == vect_constant_def)))
500 || !types_compatible_p (oprnd_info->first_op_type, type)
501 || (!STMT_VINFO_DATA_REF (stmt_info)
502 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
503 && ((!def_stmt_info
504 || STMT_VINFO_DATA_REF (def_stmt_info)
505 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
506 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
507 != (oprnd_info->first_dt != vect_reduction_def))))
508 {
509 /* Try swapping operands if we got a mismatch. */
510 if (i == commutative_op && !swapped)
511 {
512 if (dump_enabled_p ())
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "trying swapped operands\n");
515 swapped = true;
516 goto again;
517 }
518
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "Build SLP failed: different types\n");
522
523 return 1;
524 }
525 if ((dt == vect_constant_def
526 || dt == vect_external_def)
527 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
528 && (TREE_CODE (type) == BOOLEAN_TYPE
529 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
530 type)))
531 {
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534 "Build SLP failed: invalid type of def "
535 "for variable-length SLP %T\n", oprnd);
536 return -1;
537 }
538 }
539
540 /* Check the types of the definitions. */
541 switch (dt)
542 {
543 case vect_external_def:
544 /* Make sure to demote the overall operand to external. */
545 oprnd_info->first_dt = vect_external_def;
546 /* Fallthru. */
547 case vect_constant_def:
548 oprnd_info->def_stmts.quick_push (NULL);
549 oprnd_info->ops.quick_push (oprnd);
550 break;
551
552 case vect_internal_def:
553 case vect_reduction_def:
554 if (oprnd_info->first_dt == vect_reduction_def
555 && !STMT_VINFO_DATA_REF (stmt_info)
556 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
557 && !STMT_VINFO_DATA_REF (def_stmt_info)
558 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
559 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
560 {
561 /* For a SLP reduction chain we want to duplicate the
562 reduction to each of the chain members. That gets
563 us a sane SLP graph (still the stmts are not 100%
564 correct wrt the initial values). */
565 gcc_assert (!first);
566 oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
567 oprnd_info->ops.quick_push (oprnd_info->ops[0]);
568 break;
569 }
570 /* Fallthru. */
571 case vect_induction_def:
572 oprnd_info->def_stmts.quick_push (def_stmt_info);
573 oprnd_info->ops.quick_push (oprnd);
574 break;
575
576 default:
577 /* FORNOW: Not supported. */
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
580 "Build SLP failed: illegal type of def %T\n",
581 oprnd);
582
583 return -1;
584 }
585 }
586
587 /* Swap operands. */
588 if (swapped)
589 {
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE, vect_location,
592 "swapped operands to match def types in %G",
593 stmt_info->stmt);
594 }
595
596 *swap = swapped;
597 return 0;
598 }
599
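/* Caller-side sketch, mirroring the use in vect_build_slp_tree_2
   below: a return of -1 is fatal for the whole group while 1 only
   invalidates the current lane:

     int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					    stmts, i, &oprnds_info);
     if (res != 0)
       matches[(res == -1) ? 0 : i] = false;  */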
600 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
601 Return true if we can, meaning that this choice doesn't conflict with
602 existing SLP nodes that use STMT_INFO. */
603
604 static bool
605 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
606 {
607 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
608 if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
609 return true;
610
611 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
612 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
613 {
614 /* We maintain the invariant that if any statement in the group is
615 used, all other members of the group have the same vector type. */
616 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
617 stmt_vec_info member_info = first_info;
618 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
619 if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
620 || is_pattern_stmt_p (member_info))
621 break;
622
623 if (!member_info)
624 {
625 for (member_info = first_info; member_info;
626 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
627 STMT_VINFO_VECTYPE (member_info) = vectype;
628 return true;
629 }
630 }
631 else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
632 && !is_pattern_stmt_p (stmt_info))
633 {
634 STMT_VINFO_VECTYPE (stmt_info) = vectype;
635 return true;
636 }
637
638 if (dump_enabled_p ())
639 {
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "Build SLP failed: incompatible vector"
642 " types for: %G", stmt_info->stmt);
643 dump_printf_loc (MSG_NOTE, vect_location,
644 " old vector type: %T\n", old_vectype);
645 dump_printf_loc (MSG_NOTE, vect_location,
646 " new vector type: %T\n", vectype);
647 }
648 return false;
649 }
650
651 /* Return true if call statements CALL1 and CALL2 are similar enough
652 to be combined into the same SLP group. */
653
654 static bool
655 compatible_calls_p (gcall *call1, gcall *call2)
656 {
657 unsigned int nargs = gimple_call_num_args (call1);
658 if (nargs != gimple_call_num_args (call2))
659 return false;
660
661 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
662 return false;
663
664 if (gimple_call_internal_p (call1))
665 {
666 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
667 TREE_TYPE (gimple_call_lhs (call2))))
668 return false;
669 for (unsigned int i = 0; i < nargs; ++i)
670 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
671 TREE_TYPE (gimple_call_arg (call2, i))))
672 return false;
673 }
674 else
675 {
676 if (!operand_equal_p (gimple_call_fn (call1),
677 gimple_call_fn (call2), 0))
678 return false;
679
680 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
681 return false;
682 }
683 return true;
684 }
685
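/* Example (illustrative): two calls x0 = sqrtf (a0) and
   x1 = sqrtf (a1) are compatible and may share an SLP group, while
   sqrtf (a0) and sqrt (a1) differ in their combined function (and
   argument type) and are rejected.  */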
686 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
687 caller's attempt to find the vector type in STMT_INFO with the narrowest
688 element type. Return true if VECTYPE is nonnull and if it is valid
689 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
690 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
691 vect_build_slp_tree. */
692
693 static bool
694 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
695 unsigned int group_size,
696 tree vectype, poly_uint64 *max_nunits)
697 {
698 if (!vectype)
699 {
700 if (dump_enabled_p ())
701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
702 "Build SLP failed: unsupported data-type in %G\n",
703 stmt_info->stmt);
704 /* Fatal mismatch. */
705 return false;
706 }
707
708 /* If populating the vector type requires unrolling then fail
709 before adjusting *max_nunits for basic-block vectorization. */
710 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
711 unsigned HOST_WIDE_INT const_nunits;
712 if (is_a <bb_vec_info> (vinfo)
713 && (!nunits.is_constant (&const_nunits)
714 || const_nunits > group_size))
715 {
716 if (dump_enabled_p ())
717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718 "Build SLP failed: unrolling required "
719 "in basic block SLP\n");
720 /* Fatal mismatch. */
721 return false;
722 }
723
724 /* In case of multiple types we need to detect the smallest type. */
725 vect_update_max_nunits (max_nunits, vectype);
726 return true;
727 }
728
729 /* Verify that the scalar stmts STMTS are isomorphic, do not require
730 data permutation and are not of unsupported types of operation.
731 Return true if they are, otherwise return false and indicate in
732 *MATCHES which stmts are not isomorphic to the first one. If
733 MATCHES[0] is false then this indicates that the comparison could not
734 be carried out or that the stmts will never be vectorized by SLP.
735
736 Note a COND_EXPR is possibly isomorphic to another one after swapping
737 its operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR isomorphic to
738 the first stmt by swapping the two operands of its comparison; set
739 SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting
740 the comparison code. Take A1 >= B1 ? X1 : Y1 as an example: it can be
741 swapped to B1 <= A1 ? X1 : Y1, or be inverted to A1 < B1 ? Y1 : X1. */
742
743 static bool
744 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
745 vec<stmt_vec_info> stmts, unsigned int group_size,
746 poly_uint64 *max_nunits, bool *matches,
747 bool *two_operators, tree *node_vectype)
748 {
749 unsigned int i;
750 stmt_vec_info first_stmt_info = stmts[0];
751 enum tree_code first_stmt_code = ERROR_MARK;
752 enum tree_code alt_stmt_code = ERROR_MARK;
753 enum tree_code rhs_code = ERROR_MARK;
754 enum tree_code first_cond_code = ERROR_MARK;
755 tree lhs;
756 bool need_same_oprnds = false;
757 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
758 optab optab;
759 int icode;
760 machine_mode optab_op2_mode;
761 machine_mode vec_mode;
762 stmt_vec_info first_load = NULL, prev_first_load = NULL;
763 bool load_p = false;
764
765 /* For every stmt in NODE find its def stmt/s. */
766 stmt_vec_info stmt_info;
767 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
768 {
769 gimple *stmt = stmt_info->stmt;
770 swap[i] = 0;
771 matches[i] = false;
772
773 if (dump_enabled_p ())
774 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
775
776 /* Fail to vectorize statements marked as unvectorizable. */
777 if (!STMT_VINFO_VECTORIZABLE (stmt_info))
778 {
779 if (dump_enabled_p ())
780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
781 "Build SLP failed: unvectorizable statement %G",
782 stmt);
783 /* Fatal mismatch. */
784 matches[0] = false;
785 return false;
786 }
787
788 lhs = gimple_get_lhs (stmt);
789 if (lhs == NULL_TREE)
790 {
791 if (dump_enabled_p ())
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "Build SLP failed: not GIMPLE_ASSIGN nor "
794 "GIMPLE_CALL %G", stmt);
795 /* Fatal mismatch. */
796 matches[0] = false;
797 return false;
798 }
799
800 tree nunits_vectype;
801 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
802 &nunits_vectype, group_size)
803 || (nunits_vectype
804 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
805 nunits_vectype, max_nunits)))
806 {
807 /* Fatal mismatch. */
808 matches[0] = false;
809 return false;
810 }
811
812 gcc_assert (vectype);
813
814 if (is_a <bb_vec_info> (vinfo)
815 && !vect_update_shared_vectype (stmt_info, vectype))
816 continue;
817
818 gcall *call_stmt = dyn_cast <gcall *> (stmt);
819 if (call_stmt)
820 {
821 rhs_code = CALL_EXPR;
822
823 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
824 load_p = true;
825 else if ((gimple_call_internal_p (call_stmt)
826 && (!vectorizable_internal_fn_p
827 (gimple_call_internal_fn (call_stmt))))
828 || gimple_call_tail_p (call_stmt)
829 || gimple_call_noreturn_p (call_stmt)
830 || !gimple_call_nothrow_p (call_stmt)
831 || gimple_call_chain (call_stmt))
832 {
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "Build SLP failed: unsupported call type %G",
836 call_stmt);
837 /* Fatal mismatch. */
838 matches[0] = false;
839 return false;
840 }
841 }
842 else
843 {
844 rhs_code = gimple_assign_rhs_code (stmt);
845 load_p = gimple_vuse (stmt);
846 }
847
848 /* Check the operation. */
849 if (i == 0)
850 {
851 *node_vectype = vectype;
852 first_stmt_code = rhs_code;
853
854 /* Shift arguments should be equal in all the packed stmts for a
855 vector shift with scalar shift operand. */
856 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
857 || rhs_code == LROTATE_EXPR
858 || rhs_code == RROTATE_EXPR)
859 {
860 vec_mode = TYPE_MODE (vectype);
861
862 /* First see if we have a vector/vector shift. */
863 optab = optab_for_tree_code (rhs_code, vectype,
864 optab_vector);
865
866 if (!optab
867 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
868 {
869 /* No vector/vector shift, try for a vector/scalar shift. */
870 optab = optab_for_tree_code (rhs_code, vectype,
871 optab_scalar);
872
873 if (!optab)
874 {
875 if (dump_enabled_p ())
876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
877 "Build SLP failed: no optab.\n");
878 /* Fatal mismatch. */
879 matches[0] = false;
880 return false;
881 }
882 icode = (int) optab_handler (optab, vec_mode);
883 if (icode == CODE_FOR_nothing)
884 {
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
887 "Build SLP failed: "
888 "op not supported by target.\n");
889 /* Fatal mismatch. */
890 matches[0] = false;
891 return false;
892 }
893 optab_op2_mode = insn_data[icode].operand[2].mode;
894 if (!VECTOR_MODE_P (optab_op2_mode))
895 {
896 need_same_oprnds = true;
897 first_op1 = gimple_assign_rhs2 (stmt);
898 }
899 }
900 }
901 else if (rhs_code == WIDEN_LSHIFT_EXPR)
902 {
903 need_same_oprnds = true;
904 first_op1 = gimple_assign_rhs2 (stmt);
905 }
906 else if (!load_p
907 && rhs_code == BIT_FIELD_REF)
908 {
909 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
910 if (TREE_CODE (vec) != SSA_NAME
911 || !types_compatible_p (vectype, TREE_TYPE (vec)))
912 {
913 if (dump_enabled_p ())
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "Build SLP failed: "
916 "BIT_FIELD_REF not supported\n");
917 /* Fatal mismatch. */
918 matches[0] = false;
919 return false;
920 }
921 }
922 else if (call_stmt
923 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
924 {
925 need_same_oprnds = true;
926 first_op1 = gimple_call_arg (call_stmt, 1);
927 }
928 }
929 else
930 {
931 if (first_stmt_code != rhs_code
932 && alt_stmt_code == ERROR_MARK)
933 alt_stmt_code = rhs_code;
934 if (first_stmt_code != rhs_code
935 && (first_stmt_code != IMAGPART_EXPR
936 || rhs_code != REALPART_EXPR)
937 && (first_stmt_code != REALPART_EXPR
938 || rhs_code != IMAGPART_EXPR)
939 /* Handle mismatches in plus/minus by computing both
940 and merging the results. */
941 && !((first_stmt_code == PLUS_EXPR
942 || first_stmt_code == MINUS_EXPR)
943 && (alt_stmt_code == PLUS_EXPR
944 || alt_stmt_code == MINUS_EXPR)
945 && rhs_code == alt_stmt_code)
946 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
947 && (first_stmt_code == ARRAY_REF
948 || first_stmt_code == BIT_FIELD_REF
949 || first_stmt_code == INDIRECT_REF
950 || first_stmt_code == COMPONENT_REF
951 || first_stmt_code == MEM_REF)))
952 {
953 if (dump_enabled_p ())
954 {
955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 "Build SLP failed: different operation "
957 "in stmt %G", stmt);
958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
959 "original stmt %G", first_stmt_info->stmt);
960 }
961 /* Mismatch. */
962 continue;
963 }
964
965 if (need_same_oprnds)
966 {
967 tree other_op1 = (call_stmt
968 ? gimple_call_arg (call_stmt, 1)
969 : gimple_assign_rhs2 (stmt));
970 if (!operand_equal_p (first_op1, other_op1, 0))
971 {
972 if (dump_enabled_p ())
973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
974 "Build SLP failed: different shift "
975 "arguments in %G", stmt);
976 /* Mismatch. */
977 continue;
978 }
979 }
980 if (!load_p
981 && first_stmt_code == BIT_FIELD_REF
982 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
983 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
984 {
985 if (dump_enabled_p ())
986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 "Build SLP failed: different BIT_FIELD_REF "
988 "arguments in %G", stmt);
989 /* Mismatch. */
990 continue;
991 }
992
993 if (!load_p && rhs_code == CALL_EXPR)
994 {
995 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
996 as_a <gcall *> (stmt)))
997 {
998 if (dump_enabled_p ())
999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1000 "Build SLP failed: different calls in %G",
1001 stmt);
1002 /* Mismatch. */
1003 continue;
1004 }
1005 }
1006 }
1007
1008 /* Grouped store or load. */
1009 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1010 {
1011 if (REFERENCE_CLASS_P (lhs))
1012 {
1013 /* Store. */
1014 ;
1015 }
1016 else
1017 {
1018 /* Load. */
1019 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1020 if (prev_first_load)
1021 {
1022 /* Check that there are no loads from different interleaving
1023 chains in the same node. */
1024 if (prev_first_load != first_load)
1025 {
1026 if (dump_enabled_p ())
1027 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1028 vect_location,
1029 "Build SLP failed: different "
1030 "interleaving chains in one node %G",
1031 stmt);
1032 /* Mismatch. */
1033 continue;
1034 }
1035 }
1036 else
1037 prev_first_load = first_load;
1038 }
1039 } /* Grouped access. */
1040 else
1041 {
1042 if (load_p)
1043 {
1044 /* Not grouped load. */
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1047 "Build SLP failed: not grouped load %G", stmt);
1048
1049 /* FORNOW: Non-grouped loads are not supported. */
1050 /* Fatal mismatch. */
1051 matches[0] = false;
1052 return false;
1053 }
1054
1055 /* Not memory operation. */
1056 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
1057 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1058 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1059 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1060 && rhs_code != VIEW_CONVERT_EXPR
1061 && rhs_code != CALL_EXPR
1062 && rhs_code != BIT_FIELD_REF)
1063 {
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "Build SLP failed: operation unsupported %G",
1067 stmt);
1068 /* Fatal mismatch. */
1069 matches[0] = false;
1070 return false;
1071 }
1072
1073 if (rhs_code == COND_EXPR)
1074 {
1075 tree cond_expr = gimple_assign_rhs1 (stmt);
1076 enum tree_code cond_code = TREE_CODE (cond_expr);
1077 enum tree_code swap_code = ERROR_MARK;
1078 enum tree_code invert_code = ERROR_MARK;
1079
1080 if (i == 0)
1081 first_cond_code = TREE_CODE (cond_expr);
1082 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1083 {
1084 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1085 swap_code = swap_tree_comparison (cond_code);
1086 invert_code = invert_tree_comparison (cond_code, honor_nans);
1087 }
1088
1089 if (first_cond_code == cond_code)
1090 ;
1091 /* Isomorphic can be achieved by swapping. */
1092 else if (first_cond_code == swap_code)
1093 swap[i] = 1;
1094 /* Isomorphic can be achieved by inverting. */
1095 else if (first_cond_code == invert_code)
1096 swap[i] = 2;
1097 else
1098 {
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "Build SLP failed: different"
1102 " operation %G", stmt);
1103 /* Mismatch. */
1104 continue;
1105 }
1106 }
1107 }
1108
1109 matches[i] = true;
1110 }
1111
1112 for (i = 0; i < group_size; ++i)
1113 if (!matches[i])
1114 return false;
1115
1116 /* If we allowed a two-operation SLP node verify the target can cope
1117 with the permute we are going to use. */
1118 if (alt_stmt_code != ERROR_MARK
1119 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1120 {
1121 *two_operators = true;
1122 }
1123
1124 return true;
1125 }
1126
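/* Example (illustrative): the group
     x0 = a0 + b0;  x1 = a1 - b1;  x2 = a2 + b2;  x3 = a3 - b3;
   is not isomorphic under a single code, but is accepted by
   vect_build_slp_tree_1 above with FIRST_STMT_CODE PLUS_EXPR,
   ALT_STMT_CODE MINUS_EXPR and *TWO_OPERATORS set;
   vect_build_slp_tree_2 then builds both a plus and a minus node and
   blends their lanes with a VEC_PERM_EXPR.  */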
1127 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1128 Note we never remove apart from at destruction time so we do not
1129 need a special value for deleted that differs from empty. */
1130 struct bst_traits
1131 {
1132 typedef vec <stmt_vec_info> value_type;
1133 typedef vec <stmt_vec_info> compare_type;
1134 static inline hashval_t hash (value_type);
1135 static inline bool equal (value_type existing, value_type candidate);
1136 static inline bool is_empty (value_type x) { return !x.exists (); }
1137 static inline bool is_deleted (value_type x) { return !x.exists (); }
1138 static const bool empty_zero_p = true;
1139 static inline void mark_empty (value_type &x) { x.release (); }
1140 static inline void mark_deleted (value_type &x) { x.release (); }
1141 static inline void remove (value_type &x) { x.release (); }
1142 };
1143 inline hashval_t
1144 bst_traits::hash (value_type x)
1145 {
1146 inchash::hash h;
1147 for (unsigned i = 0; i < x.length (); ++i)
1148 h.add_int (gimple_uid (x[i]->stmt));
1149 return h.end ();
1150 }
1151 inline bool
1152 bst_traits::equal (value_type existing, value_type candidate)
1153 {
1154 if (existing.length () != candidate.length ())
1155 return false;
1156 for (unsigned i = 0; i < existing.length (); ++i)
1157 if (existing[i] != candidate[i])
1158 return false;
1159 return true;
1160 }
1161
1162 typedef hash_map <vec <gimple *>, slp_tree,
1163 simple_hashmap_traits <bst_traits, slp_tree> >
1164 scalar_stmts_to_slp_tree_map_t;
1165
1166 static slp_tree
1167 vect_build_slp_tree_2 (vec_info *vinfo,
1168 vec<stmt_vec_info> stmts, unsigned int group_size,
1169 poly_uint64 *max_nunits,
1170 bool *matches, unsigned *npermutes, unsigned *tree_size,
1171 scalar_stmts_to_slp_tree_map_t *bst_map);
1172
1173 static slp_tree
1174 vect_build_slp_tree (vec_info *vinfo,
1175 vec<stmt_vec_info> stmts, unsigned int group_size,
1176 poly_uint64 *max_nunits,
1177 bool *matches, unsigned *npermutes, unsigned *tree_size,
1178 scalar_stmts_to_slp_tree_map_t *bst_map)
1179 {
1180 if (slp_tree *leader = bst_map->get (stmts))
1181 {
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1184 *leader ? "" : "failed ", *leader);
1185 if (*leader)
1186 {
1187 (*leader)->refcnt++;
1188 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1189 }
1190 return *leader;
1191 }
1192 poly_uint64 this_max_nunits = 1;
1193 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
1194 &this_max_nunits,
1195 matches, npermutes, tree_size, bst_map);
1196 if (res)
1197 {
1198 res->max_nunits = this_max_nunits;
1199 vect_update_max_nunits (max_nunits, this_max_nunits);
1200 /* Keep a reference for the bst_map use. */
1201 res->refcnt++;
1202 }
1203 bst_map->put (stmts.copy (), res);
1204 return res;
1205 }
1206
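/* Reference-counting sketch for the cache above: a freshly built node
   starts with refcnt 1 for its caller, gains one extra reference for
   the bst_map entry, and every later cache hit adds one more for the
   new user; failed builds are cached as NULL so they are not
   re-attempted for the same stmt set.  */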
1207 /* Recursively build an SLP tree starting from the group of stmts
1208 STMTS. Fail (and return NULL) if the def-stmts are not isomorphic,
1209 require data permutation or are of unsupported types of operation.
1210 Otherwise return the built SLP tree, recording in *MATCHES which
1211 stmts (if any) caused the build to fail and updating *MAX_NUNITS to
1212 the largest number of vector units required. */
1213
1214 static slp_tree
1215 vect_build_slp_tree_2 (vec_info *vinfo,
1216 vec<stmt_vec_info> stmts, unsigned int group_size,
1217 poly_uint64 *max_nunits,
1218 bool *matches, unsigned *npermutes, unsigned *tree_size,
1219 scalar_stmts_to_slp_tree_map_t *bst_map)
1220 {
1221 unsigned nops, i, this_tree_size = 0;
1222 poly_uint64 this_max_nunits = *max_nunits;
1223 slp_tree node;
1224
1225 matches[0] = false;
1226
1227 stmt_vec_info stmt_info = stmts[0];
1228 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1229 nops = gimple_call_num_args (stmt);
1230 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1231 {
1232 nops = gimple_num_ops (stmt) - 1;
1233 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1234 nops++;
1235 }
1236 else if (is_a <gphi *> (stmt_info->stmt))
1237 nops = 0;
1238 else
1239 return NULL;
1240
1241 /* If the SLP node is a PHI (induction or reduction), terminate
1242 the recursion. */
1243 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1244 {
1245 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1246 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
1247 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1248 max_nunits))
1249 return NULL;
1250
1251 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1252 /* Induction from different IVs is not supported. */
1253 if (def_type == vect_induction_def)
1254 {
1255 stmt_vec_info other_info;
1256 FOR_EACH_VEC_ELT (stmts, i, other_info)
1257 if (stmt_info != other_info)
1258 return NULL;
1259 }
1260 else if (def_type == vect_reduction_def
1261 || def_type == vect_double_reduction_def
1262 || def_type == vect_nested_cycle)
1263 {
1264 /* Else def types have to match. */
1265 stmt_vec_info other_info;
1266 FOR_EACH_VEC_ELT (stmts, i, other_info)
1267 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1268 return NULL;
1269 }
1270 else
1271 return NULL;
1272 (*tree_size)++;
1273 node = vect_create_new_slp_node (stmts, 0);
1274 SLP_TREE_VECTYPE (node) = vectype;
1275 return node;
1276 }
1277
1278
1279 bool two_operators = false;
1280 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1281 tree vectype = NULL_TREE;
1282 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1283 &this_max_nunits, matches, &two_operators,
1284 &vectype))
1285 return NULL;
1286
1287 /* If the SLP node is a load, terminate the recursion unless masked. */
1288 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1289 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1290 {
1291 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1292 {
1293 /* Masked load. */
1294 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1295 nops = 1;
1296 }
1297 else
1298 {
1299 *max_nunits = this_max_nunits;
1300 (*tree_size)++;
1301 node = vect_create_new_slp_node (stmts, 0);
1302 SLP_TREE_VECTYPE (node) = vectype;
1303 /* And compute the load permutation. Whether it is actually
1304 a permutation depends on the unrolling factor which is
1305 decided later. */
1306 vec<unsigned> load_permutation;
1307 int j;
1308 stmt_vec_info load_info;
1309 load_permutation.create (group_size);
1310 stmt_vec_info first_stmt_info
1311 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1312 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1313 {
1314 int load_place = vect_get_place_in_interleaving_chain
1315 (load_info, first_stmt_info);
1316 gcc_assert (load_place != -1);
1317 load_permutation.safe_push (load_place);
1318 }
1319 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1320 return node;
1321 }
1322 }
1323 else if (gimple_assign_single_p (stmt_info->stmt)
1324 && !gimple_vuse (stmt_info->stmt)
1325 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1326 {
1327 /* vect_build_slp_tree_1 determined that all BIT_FIELD_REFs reference
1328 the same SSA name vector of a type compatible with vectype. */
1329 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1330 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1331 stmt_vec_info estmt_info;
1332 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1333 {
1334 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1335 tree bfref = gimple_assign_rhs1 (estmt);
1336 HOST_WIDE_INT lane;
1337 if (!known_eq (bit_field_size (bfref),
1338 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1339 || !constant_multiple_p (bit_field_offset (bfref),
1340 bit_field_size (bfref), &lane))
1341 {
1342 lperm.release ();
1343 return NULL;
1344 }
1345 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1346 }
1347 slp_tree vnode = vect_create_new_slp_node (vNULL);
1348 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1349 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1350 /* We are always building a permutation node even if it is an identity
1351 permute to shield the rest of the vectorizer from the odd node
1352 representing an actual vector without any scalar ops.
1353 ??? We could hide it completely by making the permute node
1354 external? */
1355 node = vect_create_new_slp_node (stmts, 1);
1356 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1357 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1358 SLP_TREE_VECTYPE (node) = vectype;
1359 SLP_TREE_CHILDREN (node).quick_push (vnode);
1360 return node;
1361 }
1362
1363 /* Get at the operands, verifying they are compatible. */
1364 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1365 slp_oprnd_info oprnd_info;
1366 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1367 {
1368 int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1369 stmts, i, &oprnds_info);
1370 if (res != 0)
1371 matches[(res == -1) ? 0 : i] = false;
1372 if (!matches[0])
1373 break;
1374 }
1375 for (i = 0; i < group_size; ++i)
1376 if (!matches[i])
1377 {
1378 vect_free_oprnd_info (oprnds_info);
1379 return NULL;
1380 }
1381
1382 auto_vec<slp_tree, 4> children;
1383
1384 stmt_info = stmts[0];
1385
1386 /* Create SLP_TREE nodes for the definition node/s. */
1387 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1388 {
1389 slp_tree child;
1390 unsigned int j;
1391
1392 if (oprnd_info->first_dt == vect_uninitialized_def)
1393 {
1394 /* COND_EXPRs end up with one operand slot too many if the condition
1395 is an SSA name. */
1396 gcc_assert (i == 3 && nops == 4);
1397 continue;
1398 }
1399
1400 if (oprnd_info->first_dt != vect_internal_def
1401 && oprnd_info->first_dt != vect_reduction_def
1402 && oprnd_info->first_dt != vect_induction_def)
1403 {
1404 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1405 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1406 oprnd_info->ops = vNULL;
1407 children.safe_push (invnode);
1408 continue;
1409 }
1410
1411 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1412 group_size, &this_max_nunits,
1413 matches, npermutes,
1414 &this_tree_size, bst_map)) != NULL)
1415 {
1416 oprnd_info->def_stmts = vNULL;
1417 children.safe_push (child);
1418 continue;
1419 }
1420
1421 /* If the SLP build failed fatally and we analyze a basic-block
1422 simply treat nodes we fail to build as externally defined
1423 (and thus build vectors from the scalar defs).
1424 The cost model will reject outright expensive cases.
1425 ??? This doesn't treat cases where permutation ultimately
1426 fails (or we don't try permutation below). Ideally we'd
1427 even compute a permutation that will end up with the maximum
1428 SLP tree size... */
1429 if (is_a <bb_vec_info> (vinfo)
1430 && !matches[0]
1431 /* ??? Rejecting patterns this way doesn't work. We'd have to
1432 do extra work to cancel the pattern so the uses see the
1433 scalar version. */
1434 && !is_pattern_stmt_p (stmt_info)
1435 && !oprnd_info->any_pattern)
1436 {
1437 if (dump_enabled_p ())
1438 dump_printf_loc (MSG_NOTE, vect_location,
1439 "Building vector operands from scalars\n");
1440 this_tree_size++;
1441 child = vect_create_new_slp_node (oprnd_info->ops);
1442 children.safe_push (child);
1443 oprnd_info->ops = vNULL;
1444 oprnd_info->def_stmts = vNULL;
1445 continue;
1446 }
1447
1448 /* If the SLP build for operand zero failed and operands zero
1449 and one can be commuted, try that for the scalar stmts
1450 that failed the match. */
1451 if (i == 0
1452 /* A first scalar stmt mismatch signals a fatal mismatch. */
1453 && matches[0]
1454 /* ??? For COND_EXPRs we can swap the comparison operands
1455 as well as the arms under some constraints. */
1456 && nops == 2
1457 && oprnds_info[1]->first_dt == vect_internal_def
1458 && is_gimple_assign (stmt_info->stmt)
1459 /* Swapping operands for reductions breaks assumptions later on. */
1460 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1461 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1462 /* Do so only if the number of unsuccessful permutes was no more
1463 than a cut-off, as re-trying the recursive match on
1464 possibly each level of the tree would expose exponential
1465 behavior. */
1466 && *npermutes < 4)
1467 {
1468 /* See whether we can swap the matching or the non-matching
1469 stmt operands. */
1470 bool swap_not_matching = true;
1471 do
1472 {
1473 for (j = 0; j < group_size; ++j)
1474 {
1475 if (matches[j] != !swap_not_matching)
1476 continue;
1477 stmt_vec_info stmt_info = stmts[j];
1478 /* Verify if we can swap operands of this stmt. */
1479 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1480 if (!stmt
1481 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1482 {
1483 if (!swap_not_matching)
1484 goto fail;
1485 swap_not_matching = false;
1486 break;
1487 }
1488 }
1489 }
1490 while (j != group_size);
1491
1492 /* Swap mismatched definition stmts. */
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Re-trying with swapped operands of stmts ");
1496 for (j = 0; j < group_size; ++j)
1497 if (matches[j] == !swap_not_matching)
1498 {
1499 std::swap (oprnds_info[0]->def_stmts[j],
1500 oprnds_info[1]->def_stmts[j]);
1501 std::swap (oprnds_info[0]->ops[j],
1502 oprnds_info[1]->ops[j]);
1503 if (dump_enabled_p ())
1504 dump_printf (MSG_NOTE, "%d ", j);
1505 }
1506 if (dump_enabled_p ())
1507 dump_printf (MSG_NOTE, "\n");
1508 /* And try again with scratch 'matches' ... */
1509 bool *tem = XALLOCAVEC (bool, group_size);
1510 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1511 group_size, &this_max_nunits,
1512 tem, npermutes,
1513 &this_tree_size, bst_map)) != NULL)
1514 {
1515 oprnd_info->def_stmts = vNULL;
1516 children.safe_push (child);
1517 continue;
1518 }
1519
1520 ++*npermutes;
1521 }
1522
1523 fail:
1524 gcc_assert (child == NULL);
1525 FOR_EACH_VEC_ELT (children, j, child)
1526 vect_free_slp_tree (child, false);
1527 vect_free_oprnd_info (oprnds_info);
1528 return NULL;
1529 }
1530
1531 vect_free_oprnd_info (oprnds_info);
1532
1533 /* If all children of this node were built up from uniform scalars
1534 then throw them away, causing this node to be built from scalars
1535 instead. The exception is the SLP node for the vector store. */
1536 if (is_a <bb_vec_info> (vinfo)
1537 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1538 /* ??? Rejecting patterns this way doesn't work. We'd have to
1539 do extra work to cancel the pattern so the uses see the
1540 scalar version. */
1541 && !is_pattern_stmt_p (stmt_info))
1542 {
1543 slp_tree child;
1544 unsigned j;
1545 FOR_EACH_VEC_ELT (children, j, child)
1546 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
1547 || !vect_slp_tree_uniform_p (child))
1548 break;
1549 if (!child)
1550 {
1551 /* Roll back. */
1552 matches[0] = false;
1553 FOR_EACH_VEC_ELT (children, j, child)
1554 vect_free_slp_tree (child, false);
1555
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_NOTE, vect_location,
1558 "Building parent vector operands from "
1559 "scalars instead\n");
1560 return NULL;
1561 }
1562 }
1563
1564 *tree_size += this_tree_size + 1;
1565 *max_nunits = this_max_nunits;
1566
1567 if (two_operators)
1568 {
1569 /* ??? We'd likely want to either cache in bst_map something like
1570 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1571 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1572 explicit stmts to put in so the keying on 'stmts' doesn't
1573 work (but we have the same issue with nodes that use 'ops'). */
1574 slp_tree one = new _slp_tree;
1575 slp_tree two = new _slp_tree;
1576 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1577 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1578 SLP_TREE_VECTYPE (one) = vectype;
1579 SLP_TREE_VECTYPE (two) = vectype;
1580 SLP_TREE_CHILDREN (one).safe_splice (children);
1581 SLP_TREE_CHILDREN (two).safe_splice (children);
1582 slp_tree child;
1583 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1584 child->refcnt++;
1585
1586 /* Here we record the original defs since this
1587 node represents the final lane configuration. */
1588 node = vect_create_new_slp_node (stmts, 2);
1589 SLP_TREE_VECTYPE (node) = vectype;
1590 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1591 SLP_TREE_CHILDREN (node).quick_push (one);
1592 SLP_TREE_CHILDREN (node).quick_push (two);
1593 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1594 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1595 enum tree_code ocode = ERROR_MARK;
1596 stmt_vec_info ostmt_info;
1597 unsigned j = 0;
1598 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1599 {
1600 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1601 if (gimple_assign_rhs_code (ostmt) != code0)
1602 {
1603 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1604 ocode = gimple_assign_rhs_code (ostmt);
1605 j = i;
1606 }
1607 else
1608 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1609 }
1610 SLP_TREE_CODE (one) = code0;
1611 SLP_TREE_CODE (two) = ocode;
1612 SLP_TREE_LANES (one) = stmts.length ();
1613 SLP_TREE_LANES (two) = stmts.length ();
1614 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1615 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1616 return node;
1617 }
1618
1619 node = vect_create_new_slp_node (stmts, nops);
1620 SLP_TREE_VECTYPE (node) = vectype;
1621 SLP_TREE_CHILDREN (node).splice (children);
1622 return node;
1623 }
1624
1625 /* Dump a single SLP tree NODE. */
1626
1627 static void
1628 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1629 slp_tree node)
1630 {
1631 unsigned i, j;
1632 slp_tree child;
1633 stmt_vec_info stmt_info;
1634 tree op;
1635
1636 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1637 dump_user_location_t user_loc = loc.get_user_location ();
1638 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1639 SLP_TREE_DEF_TYPE (node) == vect_external_def
1640 ? " (external)"
1641 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1642 ? " (constant)"
1643 : ""), node,
1644 estimated_poly_value (node->max_nunits), node->refcnt);
1645 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1646 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1647 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1648 else
1649 {
1650 dump_printf_loc (metadata, user_loc, "\t{ ");
1651 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1652 dump_printf (metadata, "%T%s ", op,
1653 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1654 dump_printf (metadata, "}\n");
1655 }
1656 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1657 {
1658 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1659 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1660 dump_printf (dump_kind, " %u", j);
1661 dump_printf (dump_kind, " }\n");
1662 }
1663 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1664 {
1665 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1666 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1667 dump_printf (dump_kind, " %u[%u]",
1668 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1669 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1670 dump_printf (dump_kind, " }\n");
1671 }
1672 if (SLP_TREE_CHILDREN (node).is_empty ())
1673 return;
1674 dump_printf_loc (metadata, user_loc, "\tchildren");
1675 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1676 dump_printf (dump_kind, " %p", (void *)child);
1677 dump_printf (dump_kind, "\n");
1678 }
1679
1680 DEBUG_FUNCTION void
1681 debug (slp_tree node)
1682 {
1683 debug_dump_context ctx;
1684 vect_print_slp_tree (MSG_NOTE,
1685 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1686 node);
1687 }
1688
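/* Usage note (illustrative): DEBUG_FUNCTION makes this callable from
   a debugger, e.g.

     (gdb) call debug (node)

   which dumps NODE and its immediate properties via
   vect_print_slp_tree.  */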
1689 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1690
1691 static void
1692 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1693 slp_tree node, hash_set<slp_tree> &visited)
1694 {
1695 unsigned i;
1696 slp_tree child;
1697
1698 if (visited.add (node))
1699 return;
1700
1701 vect_print_slp_tree (dump_kind, loc, node);
1702
1703 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1704 vect_print_slp_graph (dump_kind, loc, child, visited);
1705 }
1706
1707 static void
1708 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1709 slp_tree entry)
1710 {
1711 hash_set<slp_tree> visited;
1712 vect_print_slp_graph (dump_kind, loc, entry, visited);
1713 }
1714
1715 /* Mark the tree rooted at NODE with PURE_SLP. */
1716
1717 static void
1718 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1719 {
1720 int i;
1721 stmt_vec_info stmt_info;
1722 slp_tree child;
1723
1724 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1725 return;
1726
1727 if (visited.add (node))
1728 return;
1729
1730 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1731 STMT_SLP_TYPE (stmt_info) = pure_slp;
1732
1733 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1734 vect_mark_slp_stmts (child, visited);
1735 }
1736
1737 static void
1738 vect_mark_slp_stmts (slp_tree node)
1739 {
1740 hash_set<slp_tree> visited;
1741 vect_mark_slp_stmts (node, visited);
1742 }
1743
1744 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1745
1746 static void
1747 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1748 {
1749 int i;
1750 stmt_vec_info stmt_info;
1751 slp_tree child;
1752
1753 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1754 return;
1755
1756 if (visited.add (node))
1757 return;
1758
1759 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1760 {
1761 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1762 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1763 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1764 }
1765
1766 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1767 vect_mark_slp_stmts_relevant (child, visited);
1768 }
1769
1770 static void
1771 vect_mark_slp_stmts_relevant (slp_tree node)
1772 {
1773 hash_set<slp_tree> visited;
1774 vect_mark_slp_stmts_relevant (node, visited);
1775 }
1776
1777 /* Copy the SLP subtree rooted at NODE. */
1778
1779 static slp_tree
1780 slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
1781 {
1782 unsigned i;
1783
1784 bool existed_p;
1785 slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
1786 if (existed_p)
1787 return copy_ref;
1788
1789 copy_ref = new _slp_tree;
1790 slp_tree copy = copy_ref;
1791 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
1792 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
1793 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
1794 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
1795 copy->max_nunits = node->max_nunits;
1796 copy->refcnt = 0;
1797 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1798 {
1799 SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
1800 stmt_vec_info stmt_info;
1801 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1802 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
1803 }
1804 if (SLP_TREE_SCALAR_OPS (node).exists ())
1805 SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
1806 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1807 SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
1808 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1809 SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
1810 if (SLP_TREE_CHILDREN (node).exists ())
1811 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
1812 gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
1813
1814 slp_tree child;
1815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
1816 {
1817 SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
1818 SLP_TREE_CHILDREN (copy)[i]->refcnt++;
1819 }
1820 return copy;
1821 }
1822
1823 /* Rearrange the statements of NODE according to PERMUTATION. */
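/* As an illustrative sketch (not from a particular testcase):
   PERMUTATION = { 2, 0, 3, 1 } scatters the stmt in lane 0 to lane 2,
   the stmt in lane 1 to lane 0, and so on; that is, new lane
   PERMUTATION[i] receives old lane i. */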
1824
1825 static void
1826 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1827 vec<unsigned> permutation,
1828 hash_set<slp_tree> &visited)
1829 {
1830 unsigned int i;
1831 slp_tree child;
1832
1833 if (visited.add (node))
1834 return;
1835
1836 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1837 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1838
1839 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1840 {
1841 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1842 vec<stmt_vec_info> tmp_stmts;
1843 tmp_stmts.create (group_size);
1844 tmp_stmts.quick_grow (group_size);
1845 stmt_vec_info stmt_info;
1846 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1847 tmp_stmts[permutation[i]] = stmt_info;
1848 SLP_TREE_SCALAR_STMTS (node).release ();
1849 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1850 }
1851 if (SLP_TREE_SCALAR_OPS (node).exists ())
1852 {
1853 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1854 vec<tree> tmp_ops;
1855 tmp_ops.create (group_size);
1856 tmp_ops.quick_grow (group_size);
1857 tree op;
1858 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1859 tmp_ops[permutation[i]] = op;
1860 SLP_TREE_SCALAR_OPS (node).release ();
1861 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1862 }
1863 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1864 {
1865 gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
1866 for (i = 0; i < group_size; ++i)
1867 SLP_TREE_LANE_PERMUTATION (node)[i].second
1868 = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
1869 }
1870 }
1871
1872
1873 /* Attempt to reorder stmts in a reduction chain so that we don't
1874 require any load permutation. Return true if that was possible,
1875 otherwise return false. */
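/* For example (a hypothetical instance): if every load node carries the
   load permutation { 1, 0, 3, 2 } and those indices cover 0..3 exactly
   once, the scalar stmts of all nodes can be reordered by that
   permutation, after which the loads are contiguous and the load
   permutations can be dropped. */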
1876
1877 static bool
1878 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1879 {
1880 unsigned int i, j;
1881 unsigned int lidx;
1882 slp_tree node, load;
1883
1884 if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
1885 return false;
1886
1887 /* Compare all the permutation sequences to the first one. We know
1888 that at least one load is permuted. */
1889 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1890 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1891 return false;
1892 unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
1893 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1894 {
1895 if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
1896 || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
1897 return false;
1898 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
1899 if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
1900 return false;
1901 }
1902
1903 /* Check that the loads in the first sequence are different and there
1904 are no gaps between them. */
1905 auto_sbitmap load_index (group_size);
1906 bitmap_clear (load_index);
1907 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1908 {
1909 if (lidx >= group_size)
1910 return false;
1911 if (bitmap_bit_p (load_index, lidx))
1912 return false;
1913
1914 bitmap_set_bit (load_index, lidx);
1915 }
1916 for (i = 0; i < group_size; i++)
1917 if (!bitmap_bit_p (load_index, i))
1918 return false;
1919
1920 /* This permutation is valid for reduction. Since the order of the
1921 statements in the nodes is not important unless they are memory
1922 accesses, we can rearrange the statements in all the nodes
1923 according to the order of the loads. */
1924
1925 /* We have to unshare the SLP tree we modify. */
1926 hash_map<slp_tree, slp_tree> map;
1927 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1928 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1929 unshared->refcnt++;
1930 SLP_INSTANCE_TREE (slp_instn) = unshared;
1931 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1932 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1933 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1934
1935 /* Do the actual re-arrangement. */
1936 hash_set<slp_tree> visited;
1937 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1938 node->load_permutation, visited);
1939
1940 /* We are done, no actual permutations need to be generated. */
1941 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1942 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1943 {
1944 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1945 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1946 /* But we have to keep those permutations that are required because
1947 of handling of gaps. */
1948 if (known_eq (unrolling_factor, 1U)
1949 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1950 && DR_GROUP_GAP (first_stmt_info) == 0))
1951 SLP_TREE_LOAD_PERMUTATION (node).release ();
1952 else
1953 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1954 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1955 }
1956
1957 return true;
1958 }
1959
1960 /* Gather loads in the SLP graph rooted at NODE and populate the INST loads array. */
1961
1962 static void
1963 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1964 hash_set<slp_tree> &visited)
1965 {
1966 if (visited.add (node))
1967 return;
1968
1969 if (SLP_TREE_CHILDREN (node).length () == 0)
1970 {
1971 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1972 return;
1973 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1974 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1975 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1976 loads.safe_push (node);
1977 }
1978 else
1979 {
1980 unsigned i;
1981 slp_tree child;
1982 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1983 vect_gather_slp_loads (loads, child, visited);
1984 }
1985 }
1986
1987 static void
1988 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1989 {
1990 hash_set<slp_tree> visited;
1991 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1992 }
1993
1994
1995 /* Find the last scalar stmt in NODE. */
1996
1997 stmt_vec_info
1998 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1999 {
2000 stmt_vec_info last = NULL;
2001 stmt_vec_info stmt_vinfo;
2002
2003 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2004 {
2005 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2006 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2007 }
2008
2009 return last;
2010 }
2011
2012 /* Find the first stmt in NODE. */
2013
2014 stmt_vec_info
2015 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2016 {
2017 stmt_vec_info first = NULL;
2018 stmt_vec_info stmt_vinfo;
2019
2020 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2021 {
2022 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2023 if (!first
2024 || get_later_stmt (stmt_vinfo, first) == first)
2025 first = stmt_vinfo;
2026 }
2027
2028 return first;
2029 }
2030
2031 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2032 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2033 (also containing the first GROUP1_SIZE stmts, since stores are
2034 consecutive), the second containing the remainder.
2035 Return the first stmt in the second group. */
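/* For example (hypothetical sizes): splitting a group of 6 stores with
   GROUP1_SIZE == 4 yields groups of 4 and 2 stores; the gap of the new
   second group additionally skips the 4 stmts of the first group, and
   the gap of the first group grows to skip the 2 stmts of the second. */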
2036
2037 static stmt_vec_info
2038 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2039 {
2040 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2041 gcc_assert (group1_size > 0);
2042 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2043 gcc_assert (group2_size > 0);
2044 DR_GROUP_SIZE (first_vinfo) = group1_size;
2045
2046 stmt_vec_info stmt_info = first_vinfo;
2047 for (unsigned i = group1_size; i > 1; i--)
2048 {
2049 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2050 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2051 }
2052 /* STMT_INFO is now the last element of the first group. */
2053 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2054 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2055
2056 DR_GROUP_SIZE (group2) = group2_size;
2057 for (stmt_info = group2; stmt_info;
2058 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2059 {
2060 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2061 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2062 }
2063
2064 /* For the second group, the DR_GROUP_GAP is that before the original group,
2065 plus skipping over the first group. */
2066 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2067
2068 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2069 DR_GROUP_GAP (first_vinfo) += group2_size;
2070
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2073 group1_size, group2_size);
2074
2075 return group2;
2076 }
2077
2078 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2079 statements and a vector of NUNITS elements. */
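/* For example (illustrative numbers): with NUNITS == 4 and
   GROUP_SIZE == 6 the common multiple is 12 scalars, giving an
   unrolling factor of 2; exact_div asserts that the division is
   exact. */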
2080
2081 static poly_uint64
2082 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2083 {
2084 return exact_div (common_multiple (nunits, group_size), group_size);
2085 }
2086
2087 /* Analyze an SLP instance starting from a group of grouped stores. Call
2088 vect_build_slp_tree to build a tree of packed stmts if possible.
2089 Return FALSE if it's impossible to SLP any stmt in the group. */
2090
2091 static bool
2092 vect_analyze_slp_instance (vec_info *vinfo,
2093 scalar_stmts_to_slp_tree_map_t *bst_map,
2094 stmt_vec_info stmt_info, unsigned max_tree_size)
2095 {
2096 slp_instance new_instance;
2097 slp_tree node;
2098 unsigned int group_size;
2099 tree vectype, scalar_type = NULL_TREE;
2100 unsigned int i;
2101 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2102 vec<stmt_vec_info> scalar_stmts;
2103 bool constructor = false;
2104
2105 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2106 {
2107 scalar_type = TREE_TYPE (DR_REF (dr));
2108 group_size = DR_GROUP_SIZE (stmt_info);
2109 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2110 }
2111 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2112 {
2113 gcc_assert (is_a <loop_vec_info> (vinfo));
2114 vectype = STMT_VINFO_VECTYPE (stmt_info);
2115 group_size = REDUC_GROUP_SIZE (stmt_info);
2116 }
2117 else if (is_gimple_assign (stmt_info->stmt)
2118 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2119 {
2120 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2121 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2122 constructor = true;
2123 }
2124 else
2125 {
2126 gcc_assert (is_a <loop_vec_info> (vinfo));
2127 vectype = STMT_VINFO_VECTYPE (stmt_info);
2128 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2129 }
2130
2131 if (!vectype)
2132 {
2133 if (dump_enabled_p ())
2134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2135 "Build SLP failed: unsupported data-type %T\n",
2136 scalar_type);
2137
2138 return false;
2139 }
2140 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2141
2142 /* Create a node (a root of the SLP tree) for the packed scalar stmts. */
2143 scalar_stmts.create (group_size);
2144 stmt_vec_info next_info = stmt_info;
2145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146 {
2147 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2148 while (next_info)
2149 {
2150 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2151 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2152 }
2153 }
2154 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2155 {
2156 /* Collect the reduction stmts and store them in
2157 SLP_TREE_SCALAR_STMTS. */
2158 while (next_info)
2159 {
2160 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2161 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2162 }
2163 /* Mark the first element of the reduction chain as reduction to properly
2164 transform the node. In the reduction analysis phase only the last
2165 element of the chain is marked as reduction. */
2166 STMT_VINFO_DEF_TYPE (stmt_info)
2167 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2168 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2169 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2170 }
2171 else if (constructor)
2172 {
2173 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2174 tree val;
2175 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2176 {
2177 if (TREE_CODE (val) == SSA_NAME)
2178 {
2179 gimple* def = SSA_NAME_DEF_STMT (val);
2180 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2181 /* Value is defined in another basic block. */
2182 if (!def_info)
2183 return false;
2184 def_info = vect_stmt_to_vectorize (def_info);
2185 scalar_stmts.safe_push (def_info);
2186 }
2187 else
2188 return false;
2189 }
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE, vect_location,
2192 "Analyzing vectorizable constructor: %G\n",
2193 stmt_info->stmt);
2194 }
2195 else
2196 {
2197 /* Collect reduction statements. */
2198 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2199 for (i = 0; reductions.iterate (i, &next_info); i++)
2200 scalar_stmts.safe_push (next_info);
2201 }
2202
2203 /* Build the tree for the SLP instance. */
2204 bool *matches = XALLOCAVEC (bool, group_size);
2205 unsigned npermutes = 0;
2206 poly_uint64 max_nunits = nunits;
2207 unsigned tree_size = 0;
2208 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2209 &max_nunits, matches, &npermutes,
2210 &tree_size, bst_map);
2211 if (node != NULL)
2212 {
2213 /* Calculate the unrolling factor based on the smallest type. */
2214 poly_uint64 unrolling_factor
2215 = calculate_unrolling_factor (max_nunits, group_size);
2216
2217 if (maybe_ne (unrolling_factor, 1U)
2218 && is_a <bb_vec_info> (vinfo))
2219 {
2220 unsigned HOST_WIDE_INT const_max_nunits;
2221 if (!max_nunits.is_constant (&const_max_nunits)
2222 || const_max_nunits > group_size)
2223 {
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226 "Build SLP failed: store group "
2227 "size not a multiple of the vector size "
2228 "in basic block SLP\n");
2229 vect_free_slp_tree (node, false);
2230 return false;
2231 }
2232 /* Fatal mismatch. */
2233 matches[0] = true;
2234 matches[group_size / const_max_nunits * const_max_nunits] = false;
2235 vect_free_slp_tree (node, false);
2236 }
2237 else
2238 {
2239 /* Create a new SLP instance. */
2240 new_instance = XNEW (class _slp_instance);
2241 SLP_INSTANCE_TREE (new_instance) = node;
2242 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2243 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2244 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2245
2246 vect_gather_slp_loads (new_instance, node);
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_NOTE, vect_location,
2249 "SLP size %u vs. limit %u.\n",
2250 tree_size, max_tree_size);
2251
2252 /* Check whether any load is possibly permuted. */
2253 slp_tree load_node;
2254 bool loads_permuted = false;
2255 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2256 {
2257 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2258 continue;
2259 unsigned j;
2260 stmt_vec_info load_info;
2261 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2262 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2263 {
2264 loads_permuted = true;
2265 break;
2266 }
2267 }
2268
2269 /* If the loads and stores can be handled with load/store-lane
2270 instructions, do not generate this SLP instance. */
2271 if (is_a <loop_vec_info> (vinfo)
2272 && loads_permuted
2273 && dr && vect_store_lanes_supported (vectype, group_size, false))
2274 {
2275 slp_tree load_node;
2276 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2277 {
2278 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2279 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2280 /* Use SLP for strided accesses (or if we can't load-lanes). */
2281 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2282 || ! vect_load_lanes_supported
2283 (STMT_VINFO_VECTYPE (stmt_vinfo),
2284 DR_GROUP_SIZE (stmt_vinfo), false))
2285 break;
2286 }
2287 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2288 {
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 "Built SLP cancelled: can use "
2292 "load/store-lanes\n");
2293 vect_free_slp_instance (new_instance, false);
2294 return false;
2295 }
2296 }
2297
2298 /* If this is a reduction chain with a conversion in front,
2299 amend the SLP tree with a node for that. */
2300 if (!dr
2301 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2302 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2303 {
2304 /* Get at the conversion stmt - we know it's the single use
2305 of the last stmt of the reduction chain. */
2306 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2307 use_operand_p use_p;
2308 gimple *use_stmt;
2309 bool r = single_imm_use (gimple_assign_lhs (tem),
2310 &use_p, &use_stmt);
2311 gcc_assert (r);
2312 next_info = vinfo->lookup_stmt (use_stmt);
2313 next_info = vect_stmt_to_vectorize (next_info);
2314 scalar_stmts = vNULL;
2315 scalar_stmts.create (group_size);
2316 for (unsigned i = 0; i < group_size; ++i)
2317 scalar_stmts.quick_push (next_info);
2318 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2319 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2320 SLP_TREE_CHILDREN (conv).quick_push (node);
2321 SLP_INSTANCE_TREE (new_instance) = conv;
2322 /* We also have to fake this conversion stmt as SLP reduction
2323 group so we don't have to mess with too much code
2324 elsewhere. */
2325 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2326 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2327 }
2328
2329 vinfo->slp_instances.safe_push (new_instance);
2330
2331 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2332 the number of scalar stmts in the root in a few places.
2333 Verify that assumption holds. */
2334 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2335 .length () == group_size);
2336
2337 if (dump_enabled_p ())
2338 {
2339 dump_printf_loc (MSG_NOTE, vect_location,
2340 "Final SLP tree for instance:\n");
2341 vect_print_slp_graph (MSG_NOTE, vect_location,
2342 SLP_INSTANCE_TREE (new_instance));
2343 }
2344
2345 return true;
2346 }
2347 }
2348 else
2349 {
2350 /* Failed to SLP. */
2351 /* Free the allocated memory. */
2352 scalar_stmts.release ();
2353 }
2354
2355 /* For basic block SLP, try to break the group up into multiples of the
2356 vector size. */
2357 unsigned HOST_WIDE_INT const_nunits;
2358 if (is_a <bb_vec_info> (vinfo)
2359 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2360 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2361 && nunits.is_constant (&const_nunits))
2362 {
2363 /* We consider breaking the group only on VF boundaries from the existing
2364 start. */
2365 for (i = 0; i < group_size; i++)
2366 if (!matches[i]) break;
2367
2368 if (i >= const_nunits && i < group_size)
2369 {
2370 /* Split into two groups at the first vector boundary before i. */
2371 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2372 unsigned group1_size = i & ~(const_nunits - 1);
2373
2374 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2375 group1_size);
2376 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2377 max_tree_size);
2378 /* If the first non-match was in the middle of a vector,
2379 skip the rest of that vector. */
2380 if (group1_size < i)
2381 {
2382 i = group1_size + const_nunits;
2383 if (i < group_size)
2384 rest = vect_split_slp_store_group (rest, const_nunits);
2385 }
2386 if (i < group_size)
2387 res |= vect_analyze_slp_instance (vinfo, bst_map,
2388 rest, max_tree_size);
2389 return res;
2390 }
2391 /* Even though the first vector did not all match, we might be able to SLP
2392 (some) of the remainder. FORNOW ignore this possibility. */
2393 }
2394
2395 return false;
2396 }
2397
2398
2399 /* Check if there are stmts in the loop that can be vectorized using SLP.
2400 Build SLP trees of packed scalar stmts if SLP is possible. */
2401
2402 opt_result
2403 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2404 {
2405 unsigned int i;
2406 stmt_vec_info first_element;
2407
2408 DUMP_VECT_SCOPE ("vect_analyze_slp");
2409
2410 scalar_stmts_to_slp_tree_map_t *bst_map
2411 = new scalar_stmts_to_slp_tree_map_t ();
2412
2413 /* Find SLP sequences starting from groups of grouped stores. */
2414 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2415 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2416
2417 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2418 {
2419 if (loop_vinfo->reduction_chains.length () > 0)
2420 {
2421 /* Find SLP sequences starting from reduction chains. */
2422 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2423 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2424 max_tree_size))
2425 {
2426 /* Dissolve reduction chain group. */
2427 stmt_vec_info vinfo = first_element;
2428 stmt_vec_info last = NULL;
2429 while (vinfo)
2430 {
2431 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2432 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2433 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2434 last = vinfo;
2435 vinfo = next;
2436 }
2437 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2438 /* It can still be vectorized as part of an SLP reduction. */
2439 loop_vinfo->reductions.safe_push (last);
2440 }
2441 }
2442
2443 /* Find SLP sequences starting from groups of reductions. */
2444 if (loop_vinfo->reductions.length () > 1)
2445 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2446 max_tree_size);
2447 }
2448
2449 /* The map keeps a reference to the SLP nodes built; release that. */
2450 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2451 it != bst_map->end (); ++it)
2452 if ((*it).second)
2453 vect_free_slp_tree ((*it).second, false);
2454 delete bst_map;
2455
2456 /* Optimize permutations in SLP reductions. */
2457 slp_instance instance;
2458 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2459 {
2460 slp_tree node = SLP_INSTANCE_TREE (instance);
2461 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2462 /* Reduction (there are no data-refs in the root).
2463 In a reduction chain the order of the loads is not important. */
2464 if (!STMT_VINFO_DATA_REF (stmt_info)
2465 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2466 vect_attempt_slp_rearrange_stmts (instance);
2467 }
2468
2469 /* Gather all loads in the SLP graph. */
2470 hash_set<slp_tree> visited;
2471 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2472 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2473 visited);
2474
2475 return opt_result::success ();
2476 }
2477
2478 void
2479 vect_optimize_slp (vec_info *vinfo)
2480 {
2481 slp_tree node;
2482 unsigned i;
2483 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2484 {
2485 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2486 continue;
2487
2488 /* In basic block vectorization we allow any subchain of an interleaving
2489 chain.
2490 FORNOW: not in loop SLP because of realignment complications. */
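/* For example (illustrative): a load node whose lanes are the
   consecutive group members a[2], a[3] of an interleaving chain
   a[0] ... a[3] is such a subchain; its load permutation can simply
   be released. */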
2491 if (is_a <bb_vec_info> (vinfo))
2492 {
2493 bool subchain_p = true;
2494 stmt_vec_info next_load_info = NULL;
2495 stmt_vec_info load_info;
2496 unsigned j;
2497 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2498 {
2499 if (j != 0
2500 && (next_load_info != load_info
2501 || DR_GROUP_GAP (load_info) != 1))
2502 {
2503 subchain_p = false;
2504 break;
2505 }
2506 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2507 }
2508 if (subchain_p)
2509 {
2510 SLP_TREE_LOAD_PERMUTATION (node).release ();
2511 continue;
2512 }
2513 }
2514 else
2515 {
2516 stmt_vec_info load_info;
2517 bool this_load_permuted = false;
2518 unsigned j;
2519 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2520 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2521 {
2522 this_load_permuted = true;
2523 break;
2524 }
2525 stmt_vec_info first_stmt_info
2526 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2527 if (!this_load_permuted
2528 /* The load requires permutation when unrolling exposes
2529 a gap either because the group is larger than the SLP
2530 group-size or because there is a gap between the groups. */
2531 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2532 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2533 && DR_GROUP_GAP (first_stmt_info) == 0)))
2534 {
2535 SLP_TREE_LOAD_PERMUTATION (node).release ();
2536 continue;
2537 }
2538 }
2539 }
2540 }
2541
2542
2543 /* For each possible SLP instance decide whether to SLP it and calculate the
2544 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2545 to SLP at least one instance. */
2546
2547 bool
2548 vect_make_slp_decision (loop_vec_info loop_vinfo)
2549 {
2550 unsigned int i;
2551 poly_uint64 unrolling_factor = 1;
2552 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2553 slp_instance instance;
2554 int decided_to_slp = 0;
2555
2556 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2557
2558 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2559 {
2560 /* FORNOW: SLP if you can. */
2561 /* All unroll factors have the form:
2562
2563 GET_MODE_SIZE (vinfo->vector_mode) * X
2564
2565 for some rational X, so they must have a common multiple. */
2566 unrolling_factor
2567 = force_common_multiple (unrolling_factor,
2568 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2569
2570 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2571 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2572 loop-based vectorization. Such stmts will be marked as HYBRID. */
2573 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2574 decided_to_slp++;
2575 }
2576
2577 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2578
2579 if (decided_to_slp && dump_enabled_p ())
2580 {
2581 dump_printf_loc (MSG_NOTE, vect_location,
2582 "Decided to SLP %d instances. Unrolling factor ",
2583 decided_to_slp);
2584 dump_dec (MSG_NOTE, unrolling_factor);
2585 dump_printf (MSG_NOTE, "\n");
2586 }
2587
2588 return (decided_to_slp > 0);
2589 }
2590
2591 /* Private data for vect_detect_hybrid_slp. */
2592 struct vdhs_data
2593 {
2594 loop_vec_info loop_vinfo;
2595 vec<stmt_vec_info> *worklist;
2596 };
2597
2598 /* Walker for walk_gimple_op. */
2599
2600 static tree
2601 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2602 {
2603 walk_stmt_info *wi = (walk_stmt_info *)data;
2604 vdhs_data *dat = (vdhs_data *)wi->info;
2605
2606 if (wi->is_lhs)
2607 return NULL_TREE;
2608
2609 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2610 if (!def_stmt_info)
2611 return NULL_TREE;
2612 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2613 if (PURE_SLP_STMT (def_stmt_info))
2614 {
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2617 def_stmt_info->stmt);
2618 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2619 dat->worklist->safe_push (def_stmt_info);
2620 }
2621
2622 return NULL_TREE;
2623 }
2624
2625 /* Find stmts that must be both vectorized and SLPed. */
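/* For example (a sketch): if a def computed by a pure-SLP stmt is also
   used by a stmt that is only loop-vectorized, the defining stmt is
   marked hybrid below so it additionally receives a non-SLP vectorized
   copy. */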
2626
2627 void
2628 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2629 {
2630 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2631
2632 /* All stmts participating in SLP are marked pure_slp, all other
2633 stmts are loop_vect.
2634 First collect all loop_vect stmts into a worklist. */
2635 auto_vec<stmt_vec_info> worklist;
2636 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2637 {
2638 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2639 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2640 gsi_next (&gsi))
2641 {
2642 gphi *phi = gsi.phi ();
2643 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2644 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2645 worklist.safe_push (stmt_info);
2646 }
2647 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2648 gsi_next (&gsi))
2649 {
2650 gimple *stmt = gsi_stmt (gsi);
2651 if (is_gimple_debug (stmt))
2652 continue;
2653 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2654 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2655 {
2656 for (gimple_stmt_iterator gsi2
2657 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2658 !gsi_end_p (gsi2); gsi_next (&gsi2))
2659 {
2660 stmt_vec_info patt_info
2661 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2662 if (!STMT_SLP_TYPE (patt_info)
2663 && STMT_VINFO_RELEVANT (patt_info))
2664 worklist.safe_push (patt_info);
2665 }
2666 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2667 }
2668 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2669 worklist.safe_push (stmt_info);
2670 }
2671 }
2672
2673 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2674 mark any SLP vectorized stmt as hybrid.
2675 ??? We're visiting def stmts N times (once for each non-SLP and
2676 once for each hybrid-SLP use). */
2677 walk_stmt_info wi;
2678 vdhs_data dat;
2679 dat.worklist = &worklist;
2680 dat.loop_vinfo = loop_vinfo;
2681 memset (&wi, 0, sizeof (wi));
2682 wi.info = (void *)&dat;
2683 while (!worklist.is_empty ())
2684 {
2685 stmt_vec_info stmt_info = worklist.pop ();
2686 /* Since SSA operands are not set up for pattern stmts we need
2687 to use walk_gimple_op. */
2688 wi.is_lhs = 0;
2689 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2690 }
2691 }
2692
2693
2694 /* Initialize a bb_vec_info struct for the statements between
2695 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2696
2697 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2698 gimple_stmt_iterator region_end_in,
2699 vec_info_shared *shared)
2700 : vec_info (vec_info::bb, init_cost (NULL), shared),
2701 bb (gsi_bb (region_begin_in)),
2702 region_begin (region_begin_in),
2703 region_end (region_end_in)
2704 {
2705 for (gimple *stmt : this->region_stmts ())
2706 {
2707 gimple_set_uid (stmt, 0);
2708 if (is_gimple_debug (stmt))
2709 continue;
2710 add_stmt (stmt);
2711 }
2712
2713 bb->aux = this;
2714 }
2715
2716
2717 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2718 stmts in the basic block. */
2719
2720 _bb_vec_info::~_bb_vec_info ()
2721 {
2722 for (gimple *stmt : this->region_stmts ())
2723 /* Reset region marker. */
2724 gimple_set_uid (stmt, -1);
2725
2726 bb->aux = NULL;
2727 }
2728
2729 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2730 given that child nodes have already been processed, and that
2731 their def types currently match their SLP node's def type. */
2732
2733 static bool
2734 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2735 slp_instance node_instance,
2736 stmt_vector_for_cost *cost_vec)
2737 {
2738 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2739 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2740
2741 /* Calculate the number of vector statements to be created for the
2742 scalar stmts in this node. For SLP reductions it is equal to the
2743 number of vector statements in the children (which has already been
2744 calculated by the recursive call). Otherwise it is the number of
2745 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2746 VF divided by the number of elements in a vector. */
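  /* For example (illustrative numbers): a group of 4 scalar stmts with
     VF == 4 and 8-element vectors needs 4 * 4 / 8 == 2 vector stmts. */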
2747 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2748 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2749 {
2750 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2751 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2752 {
2753 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2754 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2755 break;
2756 }
2757 }
2758 else
2759 {
2760 poly_uint64 vf;
2761 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2762 vf = loop_vinfo->vectorization_factor;
2763 else
2764 vf = 1;
2765 unsigned int group_size = SLP_TREE_LANES (node);
2766 tree vectype = SLP_TREE_VECTYPE (node);
2767 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2768 = vect_get_num_vectors (vf * group_size, vectype);
2769 }
2770
2771 /* Handle purely internal nodes. */
2772 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2773 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
2774
2775 bool dummy;
2776 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2777 node, node_instance, cost_vec);
2778 }
2779
2780 /* Try to build NODE from scalars, returning true on success.
2781 NODE_INSTANCE is the SLP instance that contains NODE. */
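/* For example: if NODE's operation cannot be vectorized but every lane
   is an SSA name computed by scalar code that remains in the block, the
   node can be turned into a vect_external_def whose SLP_TREE_SCALAR_OPS
   are those scalar lhs values (a BB-SLP-only fallback sketch). */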
2782
2783 static bool
2784 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2785 slp_instance node_instance)
2786 {
2787 stmt_vec_info stmt_info;
2788 unsigned int i;
2789
2790 if (!is_a <bb_vec_info> (vinfo)
2791 || node == SLP_INSTANCE_TREE (node_instance)
2792 || !SLP_TREE_SCALAR_STMTS (node).exists ()
2793 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2794 return false;
2795
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "Building vector operands from scalars instead\n");
2799
2800 /* Don't remove and free the child nodes here, since they could be
2801 referenced by other structures. The analysis and scheduling phases
2802 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2803 unsigned int group_size = SLP_TREE_LANES (node);
2804 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2805 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2806 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2807 {
2808 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2809 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2810 }
2811 return true;
2812 }
2813
2814 /* Compute the prologue cost for invariant or constant operands represented
2815 by NODE. */
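/* For example (illustrative operands): the invariant
   { x_1, x_1, x_1, x_1 } is costed as a single splat (scalar_to_vec),
   { x_1, y_2, x_1, y_2 } as a vec_construct, and a vector of constants
   as a constant-pool load (vector_load). */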
2816
2817 static void
2818 vect_prologue_cost_for_slp (slp_tree node,
2819 stmt_vector_for_cost *cost_vec)
2820 {
2821 /* There's a special case of an existing vector, which costs nothing. */
2822 if (SLP_TREE_SCALAR_OPS (node).length () == 0
2823 && !SLP_TREE_VEC_DEFS (node).is_empty ())
2824 return;
2825 /* Without looking at the actual initializer a vector of
2826 constants can be implemented as a load from the constant pool.
2827 When all elements are the same we can use a splat. */
2828 tree vectype = SLP_TREE_VECTYPE (node);
2829 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2830 unsigned num_vects_to_check;
2831 unsigned HOST_WIDE_INT const_nunits;
2832 unsigned nelt_limit;
2833 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2834 && ! multiple_p (const_nunits, group_size))
2835 {
2836 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2837 nelt_limit = const_nunits;
2838 }
2839 else
2840 {
2841 /* If either the vector has variable length or the vectors
2842 are composed of repeated whole groups we only need to
2843 cost construction once. All vectors will be the same. */
2844 num_vects_to_check = 1;
2845 nelt_limit = group_size;
2846 }
2847 tree elt = NULL_TREE;
2848 unsigned nelt = 0;
2849 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2850 {
2851 unsigned si = j % group_size;
2852 if (nelt == 0)
2853 elt = SLP_TREE_SCALAR_OPS (node)[si];
2854 /* ??? We're just tracking whether all operands of a single
2855 vector initializer are the same, ideally we'd check if
2856 we emitted the same one already. */
2857 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2858 elt = NULL_TREE;
2859 nelt++;
2860 if (nelt == nelt_limit)
2861 {
2862 record_stmt_cost (cost_vec, 1,
2863 SLP_TREE_DEF_TYPE (node) == vect_external_def
2864 ? (elt ? scalar_to_vec : vec_construct)
2865 : vector_load,
2866 NULL, vectype, 0, vect_prologue);
2867 nelt = 0;
2868 }
2869 }
2870 }
2871
2872 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2873 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2874
2875 Return true if the operations are supported. */
2876
2877 static bool
2878 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2879 slp_instance node_instance,
2880 hash_set<slp_tree> &visited,
2881 hash_set<slp_tree> &lvisited,
2882 stmt_vector_for_cost *cost_vec)
2883 {
2884 int i, j;
2885 slp_tree child;
2886
2887 /* Assume we can code-generate all invariants. */
2888 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2889 return true;
2890
2891 /* If we already analyzed the exact same set of scalar stmts we're done.
2892 We share the generated vector stmts for those.
2893 The SLP graph is acyclic so not caching whether we failed or succeeded
2894 doesn't result in any issue since we throw away the lvisited set
2895 when we fail. */
2896 if (visited.contains (node)
2897 || lvisited.contains (node))
2898 return true;
2899
2900 bool res = true;
2901 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2902 {
2903 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
2904 visited, lvisited, cost_vec);
2905 if (!res)
2906 break;
2907 }
2908
2909 if (res)
2910 {
2911 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2912 cost_vec);
2913 if (res)
2914 lvisited.add (node);
2915 }
2916
2917 /* When the node can be vectorized, cost the invariant nodes it references.
2918 This is not done in DFS order to allow the referring node's
2919 vectorizable_* calls to nail down the invariant node's vector type
2920 and possibly unshare it if it needs a different vector type than
2921 other referrers. */
2922 if (res)
2923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2924 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2925 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2926 /* Perform usual caching, note code-generation still
2927 code-gens these nodes multiple times but we expect
2928 to CSE them later. */
2929 && !visited.contains (child)
2930 && !lvisited.add (child))
2931 {
2932 /* ??? After auditing more code paths make a "default"
2933 and push the vector type from NODE to all children
2934 if it is not already set. */
2935 /* Compute the number of vectors to be generated. */
2936 tree vector_type = SLP_TREE_VECTYPE (child);
2937 if (!vector_type)
2938 {
2939 /* For shifts with a scalar argument we don't need
2940 to cost or code-generate anything.
2941 ??? Represent this more explicitly. */
2942 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
2943 == shift_vec_info_type)
2944 && j == 1);
2945 continue;
2946 }
2947 unsigned group_size = SLP_TREE_LANES (child);
2948 poly_uint64 vf = 1;
2949 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2950 vf = loop_vinfo->vectorization_factor;
2951 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2952 = vect_get_num_vectors (vf * group_size, vector_type);
2953 /* And cost them. */
2954 vect_prologue_cost_for_slp (child, cost_vec);
2955 }
2956
2957 /* If this node or any of its children can't be vectorized, try pruning
2958 the tree here rather than felling the whole thing. */
2959 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2960 {
2961 /* We'll need to revisit this for invariant costing and for setting
2962 the number of vectorized stmts. */
2963 res = true;
2964 }
2965
2966 return res;
2967 }
2968
2969
2970 /* Analyze statements in SLP instances of VINFO. Return true if the
2971 operations are supported. */
2972
2973 bool
2974 vect_slp_analyze_operations (vec_info *vinfo)
2975 {
2976 slp_instance instance;
2977 int i;
2978
2979 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2980
2981 hash_set<slp_tree> visited;
2982 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2983 {
2984 hash_set<slp_tree> lvisited;
2985 stmt_vector_for_cost cost_vec;
2986 cost_vec.create (2);
2987 if (!vect_slp_analyze_node_operations (vinfo,
2988 SLP_INSTANCE_TREE (instance),
2989 instance, visited, lvisited,
2990 &cost_vec)
2991 /* Instances with a root stmt require vectorized defs for the
2992 SLP tree root. */
2993 || (SLP_INSTANCE_ROOT_STMT (instance)
2994 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2995 != vect_internal_def)))
2996 {
2997 slp_tree node = SLP_INSTANCE_TREE (instance);
2998 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_NOTE, vect_location,
3001 "removing SLP instance operations starting from: %G",
3002 stmt_info->stmt);
3003 vect_free_slp_instance (instance, false);
3004 vinfo->slp_instances.ordered_remove (i);
3005 cost_vec.release ();
3006 }
3007 else
3008 {
3009 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3010 x != lvisited.end(); ++x)
3011 visited.add (*x);
3012 i++;
3013
3014 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3015 cost_vec.release ();
3016 }
3017 }
3018
3019 return !vinfo->slp_instances.is_empty ();
3020 }
3021
3022
3023 /* Compute the scalar cost of the SLP node NODE and its children
3024 and record it in COST_VEC. Do not account defs that are marked in
3025 LIFE and update LIFE according to uses of NODE. */
3026
3027 static void
3028 vect_bb_slp_scalar_cost (vec_info *vinfo,
3029 slp_tree node, vec<bool, va_heap> *life,
3030 stmt_vector_for_cost *cost_vec,
3031 hash_set<slp_tree> &visited)
3032 {
3033 unsigned i;
3034 stmt_vec_info stmt_info;
3035 slp_tree child;
3036
3037 if (visited.add (node))
3038 return;
3039
3040 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3041 {
3042 gimple *stmt = stmt_info->stmt;
3043 ssa_op_iter op_iter;
3044 def_operand_p def_p;
3045
3046 if ((*life)[i])
3047 continue;
3048
3049 /* If there is a non-vectorized use of the defs then the scalar
3050 stmt is kept live in which case we do not account it or any
3051 required defs in the SLP children in the scalar cost. This
3052 way we make the vectorization more costly when compared to
3053 the scalar cost. */
3054 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
3055 {
3056 imm_use_iterator use_iter;
3057 gimple *use_stmt;
3058 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3059 if (!is_gimple_debug (use_stmt))
3060 {
3061 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3062 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
3063 {
3064 (*life)[i] = true;
3065 BREAK_FROM_IMM_USE_STMT (use_iter);
3066 }
3067 }
3068 }
3069 if ((*life)[i])
3070 continue;
3071
3072 /* Count scalar stmts only once. */
3073 if (gimple_visited_p (stmt))
3074 continue;
3075 gimple_set_visited (stmt, true);
3076
3077 vect_cost_for_stmt kind;
3078 if (STMT_VINFO_DATA_REF (stmt_info))
3079 {
3080 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3081 kind = scalar_load;
3082 else
3083 kind = scalar_store;
3084 }
3085 else if (vect_nop_conversion_p (stmt_info))
3086 continue;
3087 else
3088 kind = scalar_stmt;
3089 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
3090 }
3091
3092 auto_vec<bool, 20> subtree_life;
3093 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3094 {
3095 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3096 {
3097 /* Do not directly pass LIFE to the recursive call, copy it to
3098 confine changes in the callee to the current child/subtree. */
3099 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3100 {
3101 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child));
3102 for (unsigned j = 0;
3103 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3104 {
3105 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3106 if (perm.first == i)
3107 subtree_life[perm.second] = (*life)[j];
3108 }
3109 }
3110 else
3111 {
3112 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3113 subtree_life.safe_splice (*life);
3114 }
3115 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3116 visited);
3117 subtree_life.truncate (0);
3118 }
3119 }
3120 }
3121
3122 /* Check if vectorization of the basic block is profitable. */
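/* For example (made-up costs): a vector prologue cost of 2, an
   inside-of-block cost of 8 and a scalar cost of 10 still passes the
   check below, since equal totals are resolved in favor of the vector
   side. */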
3123
3124 static bool
3125 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3126 {
3127 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3128 slp_instance instance;
3129 int i;
3130 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3131 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3132
3133 /* Calculate scalar cost. */
3134 stmt_vector_for_cost scalar_costs;
3135 scalar_costs.create (0);
3136 hash_set<slp_tree> visited;
3137 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3138 {
3139 auto_vec<bool, 20> life;
3140 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)));
3141 vect_bb_slp_scalar_cost (bb_vinfo,
3142 SLP_INSTANCE_TREE (instance),
3143 &life, &scalar_costs, visited);
3144 }
3145 /* Unset visited flag. */
3146 stmt_info_for_cost *si;
3147 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3148 gimple_set_visited (si->stmt_info->stmt, false);
3149
3150 void *target_cost_data = init_cost (NULL);
3151 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3152 scalar_costs.release ();
3153 unsigned dummy;
3154 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3155 destroy_cost_data (target_cost_data);
3156
3157 /* Complete the target-specific cost calculation. */
3158 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3159 &vec_inside_cost, &vec_epilogue_cost);
3160
3161 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3162
3163 if (dump_enabled_p ())
3164 {
3165 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3166 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3167 vec_inside_cost);
3168 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3169 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3170 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3171 }
3172
3173 /* Vectorization is profitable if its cost does not exceed the cost of the
3174 scalar version. Note that we err on the vector side for equal cost because
3175 the cost estimate is otherwise quite pessimistic (constant uses are
3176 free on the scalar side but cost a load on the vector side for
3177 example). */
3178 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3179 return false;
3180
3181 return true;
3182 }
3183
3184 /* Find any vectorizable constructors and add them to the grouped_stores
3185 array. */
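/* For example (illustrative GIMPLE): an assignment
     v_3 = {_1, _2, _3, _4};
   whose elements are scalar SSA names is recorded, while constructors
   with vector elements, with fewer elements than lanes, or with a
   uniform value are skipped. */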
3186
3187 static void
3188 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3189 {
3190 for (gimple *stmt : bb_vinfo->region_stmts ())
3191 {
3192 gassign *assign = dyn_cast<gassign *> (stmt);
3193 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3194 continue;
3195
3196 tree rhs = gimple_assign_rhs1 (assign);
3197 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3198 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3199 CONSTRUCTOR_NELTS (rhs))
3200 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3201 || uniform_vector_p (rhs))
3202 continue;
3203
3204 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3205 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3206 }
3207 }
3208
3209 /* Check if the region described by BB_VINFO can be vectorized, returning
3210 true if so. When returning false, set FATAL to true if the same failure
3211 would prevent vectorization at other vector sizes, false if it is still
3212 worth trying other sizes. N_STMTS is the number of statements in the
3213 region. */
3214
3215 static bool
3216 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3217 {
3218 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3219
3220 slp_instance instance;
3221 int i;
3222 poly_uint64 min_vf = 2;
3223
3224 /* The first group of checks is independent of the vector size. */
3225 fatal = true;
3226
3227 /* Analyze the data references. */
3228
3229 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3230 {
3231 if (dump_enabled_p ())
3232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3233 "not vectorized: unhandled data-ref in basic "
3234 "block.\n");
3235 return false;
3236 }
3237
3238 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3239 {
3240 if (dump_enabled_p ())
3241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3242 "not vectorized: unhandled data access in "
3243 "basic block.\n");
3244 return false;
3245 }
3246
3247 vect_slp_check_for_constructors (bb_vinfo);
3248
3249 /* If there are no grouped stores and no constructors in the region
3250 there is no need to continue with pattern recog as vect_analyze_slp
3251 will fail anyway. */
3252 if (bb_vinfo->grouped_stores.is_empty ())
3253 {
3254 if (dump_enabled_p ())
3255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3256 "not vectorized: no grouped stores in "
3257 "basic block.\n");
3258 return false;
3259 }
3260
3261 /* The rest of the analysis below depends on the vector size in some way. */
3262 fatal = false;
3263
3264 vect_pattern_recog (bb_vinfo);
3265
3266 /* Check the SLP opportunities in the basic block, analyze and build SLP
3267 trees. */
3268 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3269 {
3270 if (dump_enabled_p ())
3271 {
3272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3273 "Failed to SLP the basic block.\n");
3274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3275 "not vectorized: failed to find SLP opportunities "
3276 "in basic block.\n");
3277 }
3278 return false;
3279 }
3280
3281 /* Optimize permutations. */
3282 vect_optimize_slp (bb_vinfo);
3283
3284 vect_record_base_alignments (bb_vinfo);
3285
3286 /* Analyze and verify the alignment of data references and the
3287 dependence in the SLP instances. */
3288 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3289 {
3290 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
3291 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3292 {
3293 slp_tree node = SLP_INSTANCE_TREE (instance);
3294 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "removing SLP instance operations starting from: %G",
3298 stmt_info->stmt);
3299 vect_free_slp_instance (instance, false);
3300 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3301 continue;
3302 }
3303
3304 /* Mark all the statements that we want to vectorize as pure SLP and
3305 relevant. */
3306 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3307 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3308 if (SLP_INSTANCE_ROOT_STMT (instance))
3309 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3310
3311 i++;
3312 }
3313 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3314 return false;
3315
3316 if (!vect_slp_analyze_operations (bb_vinfo))
3317 {
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "not vectorized: bad operation in basic block.\n");
3321 return false;
3322 }
3323
3324 /* Cost model: check if the vectorization is worthwhile. */
3325 if (!unlimited_cost_model (NULL)
3326 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3327 {
3328 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3330 "not vectorized: vectorization is not "
3331 "profitable.\n");
3332 return false;
3333 }
3334
3335 if (dump_enabled_p ())
3336 dump_printf_loc (MSG_NOTE, vect_location,
3337 "Basic block will be vectorized using SLP\n");
3338 return true;
3339 }
3340
3341 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3342 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3343 on success. The region has N_STMTS statements and has the datarefs
3344 given by DATAREFS. */
3345
3346 static bool
3347 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3348 gimple_stmt_iterator region_end,
3349 vec<data_reference_p> datarefs,
3350 unsigned int n_stmts)
3351 {
3352 bb_vec_info bb_vinfo;
3353 auto_vector_modes vector_modes;
3354
3355 /* Autodetect the first vector mode we try. */
3356 machine_mode next_vector_mode = VOIDmode;
3357 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3358 unsigned int mode_i = 0;
3359
3360 vec_info_shared shared;
3361
3362 machine_mode autodetected_vector_mode = VOIDmode;
3363 while (1)
3364 {
3365 bool vectorized = false;
3366 bool fatal = false;
3367 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3368
3369 bool first_time_p = shared.datarefs.is_empty ();
3370 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3371 if (first_time_p)
3372 bb_vinfo->shared->save_datarefs ();
3373 else
3374 bb_vinfo->shared->check_datarefs ();
3375 bb_vinfo->vector_mode = next_vector_mode;
3376
3377 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3378 && dbg_cnt (vect_slp))
3379 {
3380 if (dump_enabled_p ())
3381 {
3382 dump_printf_loc (MSG_NOTE, vect_location,
3383 "***** Analysis succeeded with vector mode"
3384 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3385 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3386 }
3387
3388 bb_vinfo->shared->check_datarefs ();
3389 vect_schedule_slp (bb_vinfo);
3390
3391 unsigned HOST_WIDE_INT bytes;
3392 if (dump_enabled_p ())
3393 {
3394 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3395 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3396 "basic block part vectorized using %wu byte "
3397 "vectors\n", bytes);
3398 else
3399 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3400 "basic block part vectorized using variable "
3401 "length vectors\n");
3402 }
3403
3404 vectorized = true;
3405 }
3406 else
3407 {
3408 if (dump_enabled_p ())
3409 dump_printf_loc (MSG_NOTE, vect_location,
3410 "***** Analysis failed with vector mode %s\n",
3411 GET_MODE_NAME (bb_vinfo->vector_mode));
3412 }
3413
3414 if (mode_i == 0)
3415 autodetected_vector_mode = bb_vinfo->vector_mode;
3416
3417 if (!fatal)
3418 while (mode_i < vector_modes.length ()
3419 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3420 {
3421 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_NOTE, vect_location,
3423 "***** The result for vector mode %s would"
3424 " be the same\n",
3425 GET_MODE_NAME (vector_modes[mode_i]));
3426 mode_i += 1;
3427 }
3428
3429 delete bb_vinfo;
3430
3431 if (mode_i < vector_modes.length ()
3432 && VECTOR_MODE_P (autodetected_vector_mode)
3433 && (related_vector_mode (vector_modes[mode_i],
3434 GET_MODE_INNER (autodetected_vector_mode))
3435 == autodetected_vector_mode)
3436 && (related_vector_mode (autodetected_vector_mode,
3437 GET_MODE_INNER (vector_modes[mode_i]))
3438 == vector_modes[mode_i]))
3439 {
3440 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_NOTE, vect_location,
3442 "***** Skipping vector mode %s, which would"
3443 " repeat the analysis for %s\n",
3444 GET_MODE_NAME (vector_modes[mode_i]),
3445 GET_MODE_NAME (autodetected_vector_mode));
3446 mode_i += 1;
3447 }
3448
3449 if (vectorized
3450 || mode_i == vector_modes.length ()
3451 || autodetected_vector_mode == VOIDmode
3452 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3453 vector sizes will fail do not bother iterating. */
3454 || fatal)
3455 return vectorized;
3456
3457 /* Try the next biggest vector size. */
3458 next_vector_mode = vector_modes[mode_i++];
3459 if (dump_enabled_p ())
3460 dump_printf_loc (MSG_NOTE, vect_location,
3461 "***** Re-trying analysis with vector mode %s\n",
3462 GET_MODE_NAME (next_vector_mode));
3463 }
3464 }
3465
3466 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3467 true if anything in the basic-block was vectorized. */
3468
3469 bool
3470 vect_slp_bb (basic_block bb)
3471 {
3472 gimple_stmt_iterator gsi;
3473 bool any_vectorized = false;
3474
3475 gsi = gsi_after_labels (bb);
3476 while (!gsi_end_p (gsi))
3477 {
3478 gimple_stmt_iterator region_begin = gsi;
3479 vec<data_reference_p> datarefs = vNULL;
3480 int insns = 0;
3481
3482 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3483 {
3484 gimple *stmt = gsi_stmt (gsi);
3485 if (is_gimple_debug (stmt))
3486 {
3487 /* Skip leading debug stmts. */
3488 if (gsi_stmt (region_begin) == stmt)
3489 gsi_next (&region_begin);
3490 continue;
3491 }
3492 insns++;
3493
3494 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3495 vect_location = stmt;
3496
3497 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3498 break;
3499 }
3500 if (gsi_end_p (region_begin))
3501 break;
3502
3503 /* Skip leading unhandled stmts. */
3504 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3505 {
3506 gsi_next (&gsi);
3507 continue;
3508 }
3509
3510 gimple_stmt_iterator region_end = gsi;
3511
3512 if (insns > param_slp_max_insns_in_bb)
3513 {
3514 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3516 "not vectorized: too many instructions in "
3517 "basic block.\n");
3518 }
3519 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3520 any_vectorized = true;
3521
3522 if (gsi_end_p (region_end))
3523 break;
3524
3525 /* Skip the unhandled stmt. */
3526 gsi_next (&gsi);
3527 }
3528
3529 return any_vectorized;
3530 }
3531
3532
3533 /* Build a variable-length vector in which the elements in ELTS are repeated
3534 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3535 RESULTS and add any new instructions to SEQ.
3536
3537 The approach we use is:
3538
3539 (1) Find a vector mode VM with integer elements of mode IM.
3540
3541 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3542 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3543 from small vectors to IM.
3544
3545 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3546
3547 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3548 correct byte contents.
3549
3550 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3551
3552 We try to find the largest IM for which this sequence works, in order
3553 to cut down on the number of interleaves. */
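/* An illustrative example (the exact modes are whatever
   can_duplicate_and_interleave_p picks for the target): with
   ELTS = { A, B, C, D } of 32-bit type and a variable-length VECTOR_TYPE
   such as VNx4SI, a possible choice is IM = DImode and VM = VNx2DI with
   NVECTORS = 2.  Then:

     (2) view-converts { A, B } and { C, D } to DImode values AB and CD
     (3) duplicates them: pieces[0] = { AB, AB, ... },
	 pieces[1] = { CD, CD, ... }
     (4) a single interleave-lo VEC_PERM_EXPR gives { AB, CD, AB, CD, ... }
     (5) view-converting back to VNx4SI gives
	 { A, B, C, D, A, B, C, D, ... }.  */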
3554
3555 void
3556 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3557 vec<tree> elts, unsigned int nresults,
3558 vec<tree> &results)
3559 {
3560 unsigned int nelts = elts.length ();
3561 tree element_type = TREE_TYPE (vector_type);
3562
3563 /* (1) Find a vector mode VM with integer elements of mode IM. */
3564 unsigned int nvectors = 1;
3565 tree new_vector_type;
3566 tree permutes[2];
3567 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3568 &nvectors, &new_vector_type,
3569 permutes))
3570 gcc_unreachable ();
3571
3572 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3573 unsigned int partial_nelts = nelts / nvectors;
3574 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3575
3576 tree_vector_builder partial_elts;
3577 auto_vec<tree, 32> pieces (nvectors * 2);
3578 pieces.quick_grow (nvectors * 2);
3579 for (unsigned int i = 0; i < nvectors; ++i)
3580 {
3581 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3582 ELTS' has mode IM. */
3583 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3584 for (unsigned int j = 0; j < partial_nelts; ++j)
3585 partial_elts.quick_push (elts[i * partial_nelts + j]);
3586 tree t = gimple_build_vector (seq, &partial_elts);
3587 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3588 TREE_TYPE (new_vector_type), t);
3589
3590 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3591 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3592 }
3593
3594 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3595 correct byte contents.
3596
3597 We need to repeat the following operation log2(nvectors) times:
3598
3599 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3600 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3601
3602 However, if each input repeats every N elements and the number of
3603 vector elements is a multiple of N * 2, the HI result is the same
as the LO.
3604 unsigned int in_start = 0;
3605 unsigned int out_start = nvectors;
3606 unsigned int hi_start = nvectors / 2;
3607 /* A bound on the number of outputs needed to produce NRESULTS results
3608 in the final iteration. */
3609 unsigned int noutputs_bound = nvectors * nresults;
3610 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3611 {
3612 noutputs_bound /= 2;
3613 unsigned int limit = MIN (noutputs_bound, nvectors);
3614 for (unsigned int i = 0; i < limit; ++i)
3615 {
3616 if ((i & 1) != 0
3617 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3618 2 * in_repeat))
3619 {
3620 pieces[out_start + i] = pieces[out_start + i - 1];
3621 continue;
3622 }
3623
3624 tree output = make_ssa_name (new_vector_type);
3625 tree input1 = pieces[in_start + (i / 2)];
3626 tree input2 = pieces[in_start + (i / 2) + hi_start];
3627 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3628 input1, input2,
3629 permutes[i & 1]);
3630 gimple_seq_add_stmt (seq, stmt);
3631 pieces[out_start + i] = output;
3632 }
3633 std::swap (in_start, out_start);
3634 }
3635
3636 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3637 results.reserve (nresults);
3638 for (unsigned int i = 0; i < nresults; ++i)
3639 if (i < nvectors)
3640 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3641 pieces[in_start + i]));
3642 else
3643 results.quick_push (results[i - nvectors]);
3644 }
3645
3646
3647 /* For constant and loop-invariant defs in OP_NODE this function creates
3648 the vector defs that will be used in the vectorized stmts and stores them
3649 in SLP_TREE_VEC_DEFS of OP_NODE. */
3650
3651 static void
3652 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3653 {
3654 unsigned HOST_WIDE_INT nunits;
3655 tree vec_cst;
3656 unsigned j, number_of_places_left_in_vector;
3657 tree vector_type;
3658 tree vop;
3659 int group_size = op_node->ops.length ();
3660 unsigned int vec_num, i;
3661 unsigned number_of_copies = 1;
3662 bool constant_p;
3663 gimple_seq ctor_seq = NULL;
3664 auto_vec<tree, 16> permute_results;
3665
3666 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3667 vector_type = SLP_TREE_VECTYPE (op_node);
3668
3669 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3670 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3671 auto_vec<tree> voprnds (number_of_vectors);
3672
3673 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3674 created vectors. It is greater than 1 if unrolling is performed.
3675
3676 For example, we have two scalar operands, s1 and s2 (e.g., group of
3677 strided accesses of size two), while NUNITS is four (i.e., four scalars
3678 of this type can be packed in a vector). The output vector will contain
3679 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3680 will be 2).
3681
3682 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3683 containing the operands.
3684
3685 For example, NUNITS is four as before, and the group size is 8
3686 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3687 {s5, s6, s7, s8}. */
3688
3689 /* When using duplicate_and_interleave, we just need one element for
3690 each scalar statement. */
3691 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3692 nunits = group_size;
3693
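  /* E.g. (using the examples above): NUNITS == 4, GROUP_SIZE == 2 and
     NUMBER_OF_VECTORS == 1 give NUMBER_OF_COPIES == 4 * 1 / 2 == 2, while
     NUNITS == 4, GROUP_SIZE == 8 and NUMBER_OF_VECTORS == 2 give
     NUMBER_OF_COPIES == 4 * 2 / 8 == 1.  */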
3694 number_of_copies = nunits * number_of_vectors / group_size;
3695
3696 number_of_places_left_in_vector = nunits;
3697 constant_p = true;
3698 tree_vector_builder elts (vector_type, nunits, 1);
3699 elts.quick_grow (nunits);
3700 stmt_vec_info insert_after = NULL;
3701 for (j = 0; j < number_of_copies; j++)
3702 {
3703 tree op;
3704 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3705 {
3706 /* Create 'vect_ = {op0,op1,...,opn}'. */
3707 number_of_places_left_in_vector--;
3708 tree orig_op = op;
3709 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3710 {
3711 if (CONSTANT_CLASS_P (op))
3712 {
3713 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3714 {
3715 /* Can't use VIEW_CONVERT_EXPR for booleans because
3716 of possibly different sizes of scalar value and
3717 vector element. */
3718 if (integer_zerop (op))
3719 op = build_int_cst (TREE_TYPE (vector_type), 0);
3720 else if (integer_onep (op))
3721 op = build_all_ones_cst (TREE_TYPE (vector_type));
3722 else
3723 gcc_unreachable ();
3724 }
3725 else
3726 op = fold_unary (VIEW_CONVERT_EXPR,
3727 TREE_TYPE (vector_type), op);
3728 gcc_assert (op && CONSTANT_CLASS_P (op));
3729 }
3730 else
3731 {
3732 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3733 gimple *init_stmt;
3734 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3735 {
3736 tree true_val
3737 = build_all_ones_cst (TREE_TYPE (vector_type));
3738 tree false_val
3739 = build_zero_cst (TREE_TYPE (vector_type));
3740 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3741 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3742 op, true_val,
3743 false_val);
3744 }
3745 else
3746 {
3747 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3748 op);
3749 init_stmt
3750 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3751 op);
3752 }
3753 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3754 op = new_temp;
3755 }
3756 }
3757 elts[number_of_places_left_in_vector] = op;
3758 if (!CONSTANT_CLASS_P (op))
3759 constant_p = false;
3760 /* For BB vectorization we have to compute an insert location
3761 when a def is inside the analyzed region since we cannot
3762 simply insert at the BB start in this case. */
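	  /* E.g. (illustrative): if ORIG_OP is defined by a statement in
	     the region being vectorized, the vector of constants and
	     invariants built from it cannot go at the region start;
	     INSERT_AFTER is advanced to the latest such def using
	     get_later_stmt below.  */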
3763 stmt_vec_info opdef;
3764 if (TREE_CODE (orig_op) == SSA_NAME
3765 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3766 && is_a <bb_vec_info> (vinfo)
3767 && (opdef = vinfo->lookup_def (orig_op)))
3768 {
3769 if (!insert_after)
3770 insert_after = opdef;
3771 else
3772 insert_after = get_later_stmt (insert_after, opdef);
3773 }
3774
3775 if (number_of_places_left_in_vector == 0)
3776 {
3777 if (constant_p
3778 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3779 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3780 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3781 else
3782 {
3783 if (permute_results.is_empty ())
3784 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3785 elts, number_of_vectors,
3786 permute_results);
3787 vec_cst = permute_results[number_of_vectors - j - 1];
3788 }
3789 if (!gimple_seq_empty_p (ctor_seq))
3790 {
3791 if (insert_after)
3792 {
3793 gimple_stmt_iterator gsi
3794 = gsi_for_stmt (insert_after->stmt);
3795 gsi_insert_seq_after (&gsi, ctor_seq,
3796 GSI_CONTINUE_LINKING);
3797 }
3798 else
3799 vinfo->insert_seq_on_entry (NULL, ctor_seq);
3800 ctor_seq = NULL;
3801 }
3802 voprnds.quick_push (vec_cst);
3803 insert_after = NULL;
3804 number_of_places_left_in_vector = nunits;
3805 constant_p = true;
3806 elts.new_vector (vector_type, nunits, 1);
3807 elts.quick_grow (nunits);
3808 }
3809 }
3810 }
3811
3812 /* Since the vectors are created in reverse order, we should reverse
3813 them. */
3814 vec_num = voprnds.length ();
3815 for (j = vec_num; j != 0; j--)
3816 {
3817 vop = voprnds[j - 1];
3818 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3819 }
3820
3821 /* When the VF is greater than the unrolling factor needed for the SLP
3822 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3823 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3824 to replicate the vectors. */
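  /* E.g. (illustrative): if NUMBER_OF_VECTORS == 4 but only the two defs
     V1 and V2 were created above, the loop below extends the defs to
     { V1, V2, V1, V2 }.  */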
3825 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
3826 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
3827 i++)
3828 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3829 }
3830
3831 /* Get the Ith vectorized definition from SLP_NODE. */
3832
3833 tree
3834 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
3835 {
3836 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
3837 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
3838 else
3839 return SLP_TREE_VEC_DEFS (slp_node)[i];
3840 }
3841
3842 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
3843
3844 void
3845 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
3846 {
3847 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
3848 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
3849 {
3850 unsigned j;
3851 gimple *vec_def_stmt;
3852 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
3853 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
3854 }
3855 else
3856 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
3857 }
3858
3859 /* Get N vectorized definitions for SLP_NODE. */
3860
3861 void
3862 vect_get_slp_defs (vec_info *,
3863 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3864 {
3865 if (n == -1U)
3866 n = SLP_TREE_CHILDREN (slp_node).length ();
3867
3868 for (unsigned i = 0; i < n; ++i)
3869 {
3870 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3871 vec<tree> vec_defs = vNULL;
3872 vect_get_slp_defs (child, &vec_defs);
3873 vec_oprnds->quick_push (vec_defs);
3874 }
3875 }
3876
3877 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3878 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3879 permute statements for the SLP node NODE. */
3880
3881 bool
3882 vect_transform_slp_perm_load (vec_info *vinfo,
3883 slp_tree node, vec<tree> dr_chain,
3884 gimple_stmt_iterator *gsi, poly_uint64 vf,
3885 bool analyze_only, unsigned *n_perms)
3886 {
3887 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3888 int vec_index = 0;
3889 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3890 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
3891 unsigned int mask_element;
3892 machine_mode mode;
3893
3894 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3895 return false;
3896
3897 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3898
3899 mode = TYPE_MODE (vectype);
3900 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3901
3902 /* Initialize the vect stmts of NODE to properly insert the generated
3903 stmts later. */
3904 if (! analyze_only)
3905 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3906 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3907 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3908
3909 /* Generate permutation masks for every NODE. The number of masks for
3910 each NODE is equal to GROUP_SIZE.
3911 E.g., we have a group of three nodes with three loads from the same
3912 location in each node, and the vector size is 4. I.e., we have an
3913 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3914 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3915 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3916 ...
3917
3918 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3919 The last mask is illegal since we assume two operands for a permute
3920 operation, and the mask element values can't be outside that range.
3921 Hence, the last mask must be converted into {2,5,5,5}.
3922 For the first two permutations we need the first and the second input
3923 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3924 we need the second and the third vectors: {b1,c1,a2,b2} and
3925 {c2,a3,b3,c3}. */
3926
3927 int vect_stmts_counter = 0;
3928 unsigned int index = 0;
3929 int first_vec_index = -1;
3930 int second_vec_index = -1;
3931 bool noop_p = true;
3932 *n_perms = 0;
3933
3934 vec_perm_builder mask;
3935 unsigned int nelts_to_build;
3936 unsigned int nvectors_per_build;
3937 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3938 && multiple_p (nunits, group_size));
3939 if (repeating_p)
3940 {
3941 /* A single vector contains a whole number of copies of the node, so:
3942 (a) all permutes can use the same mask; and
3943 (b) the permutes only need a single vector input. */
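      /* E.g. (illustrative): with a load permutation of { 1, 0 },
	 GROUP_SIZE == 2 and NUNITS a multiple of 2, the loop below
	 builds the mask { 1, 0, 3, 2, 5, 4, ... }, i.e. a pairwise
	 element swap within a single vector input.  */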
3944 mask.new_vector (nunits, group_size, 3);
3945 nelts_to_build = mask.encoded_nelts ();
3946 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3947 }
3948 else
3949 {
3950 /* We need to construct a separate mask for each vector statement. */
3951 unsigned HOST_WIDE_INT const_nunits, const_vf;
3952 if (!nunits.is_constant (&const_nunits)
3953 || !vf.is_constant (&const_vf))
3954 return false;
3955 mask.new_vector (const_nunits, const_nunits, 1);
3956 nelts_to_build = const_vf * group_size;
3957 nvectors_per_build = 1;
3958 }
3959
3960 unsigned int count = mask.encoded_nelts ();
3961 mask.quick_grow (count);
3962 vec_perm_indices indices;
3963
3964 for (unsigned int j = 0; j < nelts_to_build; j++)
3965 {
3966 unsigned int iter_num = j / group_size;
3967 unsigned int stmt_num = j % group_size;
3968 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3969 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3970 if (repeating_p)
3971 {
3972 first_vec_index = 0;
3973 mask_element = i;
3974 }
3975 else
3976 {
3977 /* Enforced before the loop when !repeating_p. */
3978 unsigned int const_nunits = nunits.to_constant ();
3979 vec_index = i / const_nunits;
3980 mask_element = i % const_nunits;
3981 if (vec_index == first_vec_index
3982 || first_vec_index == -1)
3983 {
3984 first_vec_index = vec_index;
3985 }
3986 else if (vec_index == second_vec_index
3987 || second_vec_index == -1)
3988 {
3989 second_vec_index = vec_index;
3990 mask_element += const_nunits;
3991 }
3992 else
3993 {
3994 if (dump_enabled_p ())
3995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3996 "permutation requires at "
3997 "least three vectors %G",
3998 stmt_info->stmt);
3999 gcc_assert (analyze_only);
4000 return false;
4001 }
4002
4003 gcc_assert (mask_element < 2 * const_nunits);
4004 }
4005
4006 if (mask_element != index)
4007 noop_p = false;
4008 mask[index++] = mask_element;
4009
4010 if (index == count && !noop_p)
4011 {
4012 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4013 if (!can_vec_perm_const_p (mode, indices))
4014 {
4015 if (dump_enabled_p ())
4016 {
4017 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4018 vect_location,
4019 "unsupported vect permute { ");
4020 for (i = 0; i < count; ++i)
4021 {
4022 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4023 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4024 }
4025 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4026 }
4027 gcc_assert (analyze_only);
4028 return false;
4029 }
4030
4031 ++*n_perms;
4032 }
4033
4034 if (index == count)
4035 {
4036 if (!analyze_only)
4037 {
4038 tree mask_vec = NULL_TREE;
4039
4040 if (! noop_p)
4041 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4042
4043 if (second_vec_index == -1)
4044 second_vec_index = first_vec_index;
4045
4046 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4047 {
4048 /* Generate the permute statement if necessary. */
4049 tree first_vec = dr_chain[first_vec_index + ri];
4050 tree second_vec = dr_chain[second_vec_index + ri];
4051 gimple *perm_stmt;
4052 if (! noop_p)
4053 {
4054 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4055 tree perm_dest
4056 = vect_create_destination_var (gimple_assign_lhs (stmt),
4057 vectype);
4058 perm_dest = make_ssa_name (perm_dest);
4059 perm_stmt
4060 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4061 first_vec, second_vec,
4062 mask_vec);
4063 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4064 gsi);
4065 }
4066 else
4067 /* If mask was NULL_TREE generate the requested
4068 identity transform. */
4069 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4070
4071 /* Store the vector statement in NODE. */
4072 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4073 }
4074 }
4075
4076 index = 0;
4077 first_vec_index = -1;
4078 second_vec_index = -1;
4079 noop_p = true;
4080 }
4081 }
4082
4083 return true;
4084 }
4085
4086
4087 /* Vectorize the SLP permutations in NODE as specified
4088 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4089 child number and lane number.
4090 Interleaving of two two-lane two-child SLP subtrees (not supported):
4091 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4092 A blend of two four-lane two-child SLP subtrees:
4093 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4094 Highpart of a four-lane one-child SLP subtree (not supported):
4095 [ { 0, 2 }, { 0, 3 } ]
4096 Currently only a subset of these is supported by the code generated below. */
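/* An illustrative walk-through: for the blend above, with two V4SI children
   and a vectorization factor of 1, the code below emits a single two-operand
   VEC_PERM_EXPR with mask { 0, 5, 2, 7 }, where lanes of the second input
   are offset by the number of vector elements (4).  */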
4097
4098 static bool
4099 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4100 slp_tree node, stmt_vector_for_cost *cost_vec)
4101 {
4102 tree vectype = SLP_TREE_VECTYPE (node);
4103
4104 /* ??? We currently only support identical vector input and output types
4105 while the SLP IL should really do a concat + select and thus accept
4106 arbitrary mismatches. */
4107 slp_tree child;
4108 unsigned i;
4109 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4110 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4111 {
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4114 "Unsupported lane permutation\n");
4115 return false;
4116 }
4117
4118 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4119 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4120 if (dump_enabled_p ())
4121 {
4122 dump_printf_loc (MSG_NOTE, vect_location,
4123 "vectorizing permutation");
4124 for (unsigned i = 0; i < perm.length (); ++i)
4125 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4126 dump_printf (MSG_NOTE, "\n");
4127 }
4128
4129 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4130 if (!nunits.is_constant ())
4131 return false;
4132 unsigned HOST_WIDE_INT vf = 1;
4133 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4134 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4135 return false;
4136 unsigned olanes = vf * SLP_TREE_LANES (node);
4137 gcc_assert (multiple_p (olanes, nunits));
4138
4139 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
4140 from the { SLP operand, scalar lane } permutation as recorded in the
4141 SLP node as an intermediate step. This part should already work
4142 with SLP children with an arbitrary number of lanes. */
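  /* E.g. (continuing the blend walk-through above): with two V4SI children
     and VF == 1 this yields
       vperm = { { { 0, 0 }, 0 }, { { 1, 0 }, 1 },
		 { { 0, 0 }, 2 }, { { 1, 0 }, 3 } }
     i.e. lanes 0 and 2 come from vector 0 of child 0 and lanes 1 and 3
     from vector 0 of child 1.  */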
4143 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4144 auto_vec<unsigned> active_lane;
4145 vperm.create (olanes);
4146 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
4147 for (unsigned i = 0; i < vf; ++i)
4148 {
4149 for (unsigned pi = 0; pi < perm.length (); ++pi)
4150 {
4151 std::pair<unsigned, unsigned> p = perm[pi];
4152 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4153 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4154 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4155 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4156 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4157 }
4158 /* Advance to the next group. */
4159 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4160 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4161 }
4162
4163 if (dump_enabled_p ())
4164 {
4165 dump_printf_loc (MSG_NOTE, vect_location, "as");
4166 for (unsigned i = 0; i < vperm.length (); ++i)
4167 {
4168 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4169 dump_printf (MSG_NOTE, ",");
4170 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4171 vperm[i].first.first, vperm[i].first.second,
4172 vperm[i].second);
4173 }
4174 dump_printf (MSG_NOTE, "\n");
4175 }
4176
4177 /* We can only handle two-vector permutes, everything else should
4178 be lowered on the SLP level. The following is closely inspired
4179 by vect_transform_slp_perm_load and is supposed to eventually
4180 replace it.
4181 ??? As intermediate step do code-gen in the SLP tree representation
4182 somehow? */
4183 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4184 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4185 unsigned int const_nunits = nunits.to_constant ();
4186 unsigned int index = 0;
4187 unsigned int mask_element;
4188 vec_perm_builder mask;
4189 mask.new_vector (const_nunits, const_nunits, 1);
4190 unsigned int count = mask.encoded_nelts ();
4191 mask.quick_grow (count);
4192 vec_perm_indices indices;
4193 unsigned nperms = 0;
4194 for (unsigned i = 0; i < vperm.length (); ++i)
4195 {
4196 mask_element = vperm[i].second;
4197 if (first_vec.first == -1U
4198 || first_vec == vperm[i].first)
4199 first_vec = vperm[i].first;
4200 else if (second_vec.first == -1U
4201 || second_vec == vperm[i].first)
4202 {
4203 second_vec = vperm[i].first;
4204 mask_element += const_nunits;
4205 }
4206 else
4207 {
4208 if (dump_enabled_p ())
4209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4210 "permutation requires at "
4211 "least three vectors");
4212 gcc_assert (!gsi);
4213 return false;
4214 }
4215
4216 mask[index++] = mask_element;
4217
4218 if (index == count)
4219 {
4220 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4221 const_nunits);
4222 bool identity_p = indices.series_p (0, 1, 0, 1);
4223 if (!identity_p
4224 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4225 {
4226 if (dump_enabled_p ())
4227 {
4228 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4229 vect_location,
4230 "unsupported vect permute { ");
4231 for (i = 0; i < count; ++i)
4232 {
4233 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4234 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4235 }
4236 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4237 }
4238 gcc_assert (!gsi);
4239 return false;
4240 }
4241
4242 if (!identity_p)
4243 nperms++;
4244 if (gsi)
4245 {
4246 if (second_vec.first == -1U)
4247 second_vec = first_vec;
4248
4249 /* Generate the permute statement if necessary. */
4250 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4251 tree first_def
4252 = vect_get_slp_vect_def (first_node, first_vec.second);
4253 gassign *perm_stmt;
4254 tree perm_dest = make_ssa_name (vectype);
4255 if (!identity_p)
4256 {
4257 slp_tree second_node
4258 = SLP_TREE_CHILDREN (node)[second_vec.first];
4259 tree second_def
4260 = vect_get_slp_vect_def (second_node, second_vec.second);
4261 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4262 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4263 first_def, second_def,
4264 mask_vec);
4265 }
4266 else
4267 /* We need a copy here in case the def was external. */
4268 perm_stmt = gimple_build_assign (perm_dest, first_def);
4269 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4270 /* Store the vector statement in NODE. */
4271 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4272 }
4273
4274 index = 0;
4275 first_vec = std::make_pair (-1U, -1U);
4276 second_vec = std::make_pair (-1U, -1U);
4277 }
4278 }
4279
4280 if (!gsi)
4281 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4282
4283 return true;
4284 }
4285
4286 /* Vectorize SLP instance tree in postorder. */
4287
4288 static void
4289 vect_schedule_slp_instance (vec_info *vinfo,
4290 slp_tree node, slp_instance instance)
4291 {
4292 gimple_stmt_iterator si;
4293 int i;
4294 slp_tree child;
4295
4296 /* See if we have already vectorized the node in the graph of the
4297 SLP instance. */
4298 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4299 && SLP_TREE_VEC_STMTS (node).exists ())
4300 || SLP_TREE_VEC_DEFS (node).exists ())
4301 return;
4302
4303 /* Vectorize externals and constants. */
4304 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4305 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4306 {
4307 /* ??? vectorizable_shift can end up using a scalar operand which is
4308 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4309 node in this case. */
4310 if (!SLP_TREE_VECTYPE (node))
4311 return;
4312
4313 vect_create_constant_vectors (vinfo, node);
4314 return;
4315 }
4316
4317 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4318 vect_schedule_slp_instance (vinfo, child, instance);
4319
4320 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4321 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4322
4323 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4324 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_NOTE, vect_location,
4326 "------>vectorizing SLP node starting from: %G",
4327 stmt_info->stmt);
4328
4329 if (STMT_VINFO_DATA_REF (stmt_info)
4330 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
4331 {
4332 /* Vectorized loads go before the first scalar load to make it
4333 ready early, vectorized stores go before the last scalar
4334 stmt, which is where all uses are ready. */
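      /* E.g. (illustrative): for a two-element store group
	     a[0] = x_1;
	     ...
	     a[1] = y_2;
	 the vector store is inserted right before  a[1] = y_2;, where
	 both x_1 and y_2 are known to be available.  */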
4335 stmt_vec_info last_stmt_info = NULL;
4336 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
4337 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
4338 else /* DR_IS_WRITE */
4339 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4340 si = gsi_for_stmt (last_stmt_info->stmt);
4341 }
4342 else if (SLP_TREE_CHILDREN (node).is_empty ())
4343 {
4344 /* This happens for reduction and induction PHIs where we do not use the
4345 insertion iterator. */
4346 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4347 == cycle_phi_info_type
4348 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4349 == induc_vec_info_type));
4350 si = gsi_none ();
4351 }
4352 else
4353 {
4354 /* Emit other stmts after the children's vectorized defs, which is
4355 the earliest position possible. */
4356 gimple *last_stmt = NULL;
4357 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4358 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4359 {
4360 /* For fold-left reductions we are retaining the scalar
4361 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
4362 set, so the representation isn't perfect. Resort to the
4363 last scalar def here. */
4364 if (SLP_TREE_VEC_STMTS (child).is_empty ())
4365 {
4366 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
4367 == cycle_phi_info_type);
4368 gphi *phi = as_a <gphi *>
4369 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
4370 if (!last_stmt
4371 || vect_stmt_dominates_stmt_p (last_stmt, phi))
4372 last_stmt = phi;
4373 }
4374 /* We are emitting all vectorized stmts in the same place and
4375 the last one is the last.
4376 ??? Unless we have a load permutation applied and that
4377 figures to re-use an earlier generated load. */
4378 unsigned j;
4379 gimple *vstmt;
4380 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
4381 if (!last_stmt
4382 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4383 last_stmt = vstmt;
4384 }
4385 else if (!SLP_TREE_VECTYPE (child))
4386 {
4387 /* For externals without a vector type we use the unvectorized scalar defs. */
4388 unsigned j;
4389 tree def;
4390 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
4391 if (TREE_CODE (def) == SSA_NAME
4392 && !SSA_NAME_IS_DEFAULT_DEF (def))
4393 {
4394 gimple *stmt = SSA_NAME_DEF_STMT (def);
4395 if (!last_stmt
4396 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
4397 last_stmt = stmt;
4398 }
4399 }
4400 else
4401 {
4402 /* For externals we have to look at all defs since their
4403 insertion place is decided per vector. */
4404 unsigned j;
4405 tree vdef;
4406 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
4407 if (TREE_CODE (vdef) == SSA_NAME
4408 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
4409 {
4410 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
4411 if (!last_stmt
4412 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4413 last_stmt = vstmt;
4414 }
4415 }
4416 /* This can happen when all children are pre-existing vectors or
4417 constants. */
4418 if (!last_stmt)
4419 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
4420 if (is_a <gphi *> (last_stmt))
4421 si = gsi_after_labels (gimple_bb (last_stmt));
4422 else
4423 {
4424 si = gsi_for_stmt (last_stmt);
4425 gsi_next (&si);
4426 }
4427 }
4428
4429 bool done_p = false;
4430
4431 /* Handle purely internal nodes. */
4432 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4433 {
4434 /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
4435 be shared with different SLP nodes (but usually it's the same
4436 operation apart from the case where the stmt is only there for
4437 denoting the actual scalar lane defs ...). So do not call
4438 vect_transform_stmt but open-code it here (partly). */
4439 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
4440 gcc_assert (done);
4441 done_p = true;
4442 }
4443 if (!done_p)
4444 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4445 }
4446
4447 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4448 For loop vectorization this is done in vectorizable_call, but for SLP
4449 it needs to be deferred until the end of vect_schedule_slp, because multiple
4450 SLP instances may refer to the same scalar stmt. */
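/* E.g. (illustrative): a fully vectorized call  x_1 = sqrtf (y_2);  is
   replaced below by  x_1 = 0.0f;, which becomes trivially dead once all
   uses of x_1 have been rewritten to the vectorized defs and can then be
   cleaned up by DCE.  */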
4451
4452 static void
4453 vect_remove_slp_scalar_calls (vec_info *vinfo,
4454 slp_tree node, hash_set<slp_tree> &visited)
4455 {
4456 gimple *new_stmt;
4457 gimple_stmt_iterator gsi;
4458 int i;
4459 slp_tree child;
4460 tree lhs;
4461 stmt_vec_info stmt_info;
4462
4463 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4464 return;
4465
4466 if (visited.add (node))
4467 return;
4468
4469 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4470 vect_remove_slp_scalar_calls (vinfo, child, visited);
4471
4472 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4473 {
4474 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4475 if (!stmt || gimple_bb (stmt) == NULL)
4476 continue;
4477 if (is_pattern_stmt_p (stmt_info)
4478 || !PURE_SLP_STMT (stmt_info))
4479 continue;
4480 lhs = gimple_call_lhs (stmt);
4481 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4482 gsi = gsi_for_stmt (stmt);
4483 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4484 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4485 }
4486 }
4487
4488 static void
4489 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4490 {
4491 hash_set<slp_tree> visited;
4492 vect_remove_slp_scalar_calls (vinfo, node, visited);
4493 }
4494
4495 /* Vectorize the instance root. */
4496
4497 void
4498 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4499 {
4500 gassign *rstmt = NULL;
4501
4502 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4503 {
4504 gimple *child_stmt;
4505 int j;
4506
4507 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4508 {
4509 tree vect_lhs = gimple_get_lhs (child_stmt);
4510 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4511 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4512 TREE_TYPE (vect_lhs)))
4513 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4514 vect_lhs);
4515 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4516 break;
4517 }
4518 }
4519 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4520 {
4521 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4522 gimple *child_stmt;
4523 int j;
4524 vec<constructor_elt, va_gc> *v;
4525 vec_alloc (v, nelts);
4526
4527 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4528 {
4529 CONSTRUCTOR_APPEND_ELT (v,
4530 NULL_TREE,
4531 gimple_get_lhs (child_stmt));
4532 }
4533 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4534 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4535 tree r_constructor = build_constructor (rtype, v);
4536 rstmt = gimple_build_assign (lhs, r_constructor);
4537 }
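  /* E.g. (illustrative): if the root is  x = { a, b, c, d }  and the node
     was vectorized with two V2SI statements defining v0_1 and v1_2, this
     builds the replacement  x = { v0_1, v1_2 }  as a constructor of the
     original rhs type.  */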
4538
4539 gcc_assert (rstmt);
4540
4541 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4542 gsi_replace (&rgsi, rstmt, true);
4543 }
4544
4545 /* Generate vector code for all SLP instances in the loop/basic block. */
4546
4547 void
4548 vect_schedule_slp (vec_info *vinfo)
4549 {
4550 vec<slp_instance> slp_instances;
4551 slp_instance instance;
4552 unsigned int i;
4553
4554 slp_instances = vinfo->slp_instances;
4555 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4556 {
4557 slp_tree node = SLP_INSTANCE_TREE (instance);
4558 /* Schedule the tree of INSTANCE. */
4559 vect_schedule_slp_instance (vinfo, node, instance);
4560
4561 if (SLP_INSTANCE_ROOT_STMT (instance))
4562 vectorize_slp_instance_root_stmt (node, instance);
4563
4564 if (dump_enabled_p ())
4565 dump_printf_loc (MSG_NOTE, vect_location,
4566 "vectorizing stmts using SLP.\n");
4567 }
4568
4569 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4570 {
4571 slp_tree root = SLP_INSTANCE_TREE (instance);
4572 stmt_vec_info store_info;
4573 unsigned int j;
4574
4575 /* Remove scalar call stmts. Do not do this for basic-block
4576 vectorization as not all uses may be vectorized.
4577 ??? Why should this be necessary? DCE should be able to
4578 remove the stmts itself.
4579 ??? For BB vectorization we can as well remove scalar
4580 stmts starting from the SLP tree root if they have no
4581 uses. */
4582 if (is_a <loop_vec_info> (vinfo))
4583 vect_remove_slp_scalar_calls (vinfo, root);
4584
4585 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4586 {
4587 if (!STMT_VINFO_DATA_REF (store_info))
4588 break;
4589
4590 if (SLP_INSTANCE_ROOT_STMT (instance))
4591 continue;
4592
4593 store_info = vect_orig_stmt (store_info);
4594 /* Free the attached stmt_vec_info and remove the stmt. */
4595 vinfo->remove_stmt (store_info);
4596 }
4597 }
4598 }