tree-optimization/95770 - fix SLP vectorized stmt placement compute
[gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48
49
50 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
51 slp_tree, stmt_vector_for_cost *);
52
/* Initialize a SLP node.  All vectors start out empty and all scalar
   fields at their "not yet computed" values; the node starts with a
   single reference (see vect_free_slp_tree for the release side).  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  /* Def type and code are filled in once the node's role is known.  */
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  this->refcnt = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}
73
/* Tear down a SLP node.  Releases the node's own vectors; freeing of
   child nodes is the caller's job (see vect_free_slp_tree), since
   children are reference-counted and possibly shared.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}
86
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  /* Nodes may be shared between SLP instances; only the last
     reference actually frees.  */
  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  /* Undo the increment done in vect_create_new_slp_node.  */
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  delete node;
}
119
/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  /* Drop the (possibly shared) tree first, then the instance's own
     load list and the instance object itself.  */
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}
131
132
/* Create an SLP node for SCALAR_STMTS, reserving room for NOPS children.
   Takes ownership of the SCALAR_STMTS vector and bumps
   STMT_VINFO_NUM_SLP_USES on every stmt; vect_free_slp_tree undoes
   the increment when the node is freed non-finally.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  /* The first scalar stmt stands in for the whole group.  */
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();

  unsigned i;
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
152
/* Create an SLP node for the external-def operands OPS (one scalar op
   per lane).  Takes ownership of the OPS vector; no children and no
   scalar stmts are associated with such a node.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
164
165
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands, one entry per scalar stmt in the group
     (NULL for constant/external defs).  */
  vec<stmt_vec_info> def_stmts;
  /* Operands, one entry per scalar stmt in the group.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  /* True if any def in the group comes from a pattern stmt.  */
  bool any_pattern;
} *slp_oprnd_info;
182
183
184 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
185 operand. */
186 static vec<slp_oprnd_info>
187 vect_create_oprnd_info (int nops, int group_size)
188 {
189 int i;
190 slp_oprnd_info oprnd_info;
191 vec<slp_oprnd_info> oprnds_info;
192
193 oprnds_info.create (nops);
194 for (i = 0; i < nops; i++)
195 {
196 oprnd_info = XNEW (struct _slp_oprnd_info);
197 oprnd_info->def_stmts.create (group_size);
198 oprnd_info->ops.create (group_size);
199 oprnd_info->first_dt = vect_uninitialized_def;
200 oprnd_info->first_op_type = NULL_TREE;
201 oprnd_info->any_pattern = false;
202 oprnds_info.quick_push (oprnd_info);
203 }
204
205 return oprnds_info;
206 }
207
208
209 /* Free operands info. */
210
211 static void
212 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
213 {
214 int i;
215 slp_oprnd_info oprnd_info;
216
217 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
218 {
219 oprnd_info->def_stmts.release ();
220 oprnd_info->ops.release ();
221 XDELETE (oprnd_info);
222 }
223
224 oprnds_info.release ();
225 }
226
227
228 /* Return true if STMTS contains a pattern statement. */
229
230 static bool
231 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
232 {
233 stmt_vec_info stmt_info;
234 unsigned int i;
235 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
236 if (is_pattern_stmt_p (stmt_info))
237 return true;
238 return false;
239 }
240
241 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
242 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
243 of the chain. */
244
245 int
246 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
247 stmt_vec_info first_stmt_info)
248 {
249 stmt_vec_info next_stmt_info = first_stmt_info;
250 int result = 0;
251
252 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
253 return -1;
254
255 do
256 {
257 if (next_stmt_info == stmt_info)
258 return result;
259 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
260 if (next_stmt_info)
261 result += DR_GROUP_GAP (next_stmt_info);
262 }
263 while (next_stmt_info);
264
265 return -1;
266 }
267
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  On success *PERMUTES (if nonnull) receives the two
   interleaving permute masks.  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  /* Each iteration tries to fuse COUNT / NVECTORS consecutive elements
     into one wider integer element; failing that, halve the fused size
     and double the vector count.  */
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      /* Stepped encodings: 3 initial patterns of 2 elements each
		 are enough to describe the interleave for any NELTS.  */
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      /* Give up once the fused element no longer halves evenly.  */
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
342
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  /* Determine the operand count and where the first operand lives in the
     gimple statement, which differs between calls and assignments.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  /* The embedded comparison contributes an extra operand.  */
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  /* Record what the rest of the group has to match against.  */
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     end in the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      /* Recoverable by swapping operands of the parent node.  */
	      return 1;
	    }
	  /* With variable-length vectors constant/external defs must be
	     buildable via duplicate_and_interleave.  */
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For a SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}
575
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      /* Look for a member that is already committed to a vectype.  */
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;

      if (!member_info)
	{
	  /* No member is in use yet; commit the whole group to VECTYPE.  */
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    new vector type: %T\n", vectype);
    }
  return false;
}
626
627 /* Return true if call statements CALL1 and CALL2 are similar enough
628 to be combined into the same SLP group. */
629
630 static bool
631 compatible_calls_p (gcall *call1, gcall *call2)
632 {
633 unsigned int nargs = gimple_call_num_args (call1);
634 if (nargs != gimple_call_num_args (call2))
635 return false;
636
637 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
638 return false;
639
640 if (gimple_call_internal_p (call1))
641 {
642 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
643 TREE_TYPE (gimple_call_lhs (call2))))
644 return false;
645 for (unsigned int i = 0; i < nargs; ++i)
646 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
647 TREE_TYPE (gimple_call_arg (call2, i))))
648 return false;
649 }
650 else
651 {
652 if (!operand_equal_p (gimple_call_fn (call1),
653 gimple_call_fn (call2), 0))
654 return false;
655
656 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
657 return false;
658 }
659 return true;
660 }
661
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.
     BB SLP has no loop to unroll, so the group must fill at least
     one vector and the lane count must be a compile-time constant.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
704
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      /* For BB vectorization the chosen vectype has to be compatible
	 with uses of the stmt in other SLP nodes; otherwise record a
	 per-stmt mismatch and keep scanning.  */
      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  /* The first stmt establishes the reference operation and
	     vectype the rest of the group is matched against.  */
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  /* Remember the first alternate operation seen, for the
	     two-operator (plus/minus mix) support below.  */
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}
1073
1074 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1075 Note we never remove apart from at destruction time so we do not
1076 need a special value for deleted that differs from empty. */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* A slot is empty (or deleted) when its vec was never created or was
     released; both states share the !exists () representation, which is
     fine since entries are only removed at destruction time.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
1090 inline hashval_t
1091 bst_traits::hash (value_type x)
1092 {
1093 inchash::hash h;
1094 for (unsigned i = 0; i < x.length (); ++i)
1095 h.add_int (gimple_uid (x[i]->stmt));
1096 return h.end ();
1097 }
1098 inline bool
1099 bst_traits::equal (value_type existing, value_type candidate)
1100 {
1101 if (existing.length () != candidate.length ())
1102 return false;
1103 for (unsigned i = 0; i < existing.length (); ++i)
1104 if (existing[i] != candidate[i])
1105 return false;
1106 return true;
1107 }
1108
/* Map from a set of scalar stmts to the SLP tree built for them, or
   NULL when the build failed; used to share sub-trees and to avoid
   re-analyzing the same stmt set twice.  */
typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

/* Forward declaration; vect_build_slp_tree and vect_build_slp_tree_2
   are mutually recursive.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
1119
1120 static slp_tree
1121 vect_build_slp_tree (vec_info *vinfo,
1122 vec<stmt_vec_info> stmts, unsigned int group_size,
1123 poly_uint64 *max_nunits,
1124 bool *matches, unsigned *npermutes, unsigned *tree_size,
1125 scalar_stmts_to_slp_tree_map_t *bst_map)
1126 {
1127 if (slp_tree *leader = bst_map->get (stmts))
1128 {
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1131 *leader ? "" : "failed ", *leader);
1132 if (*leader)
1133 {
1134 (*leader)->refcnt++;
1135 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1136 }
1137 return *leader;
1138 }
1139 poly_uint64 this_max_nunits = 1;
1140 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
1141 &this_max_nunits,
1142 matches, npermutes, tree_size, bst_map);
1143 if (res)
1144 {
1145 res->max_nunits = this_max_nunits;
1146 vect_update_max_nunits (max_nunits, this_max_nunits);
1147 /* Keep a reference for the bst_map use. */
1148 res->refcnt++;
1149 }
1150 bst_map->put (stmts.copy (), res);
1151 return res;
1152 }
1153
/* Recursively build an SLP tree for the group of scalar stmts STMTS.
   Return NULL if the def-stmts are not isomorphic, require a data
   permutation that cannot be handled or are of an unsupported kind of
   operation; in that case MATCHES records which stmts still matched
   the first stmt of the group.  Otherwise return the built SLP node.  */
1160
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  /* Derive the operand count from the group's representative stmt.  */
  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      /* COND_EXPR has the condition as an additional operand.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      /* PHI nodes are leaves of the SLP graph; no children.  */
      node = vect_create_new_slp_node (stmts, 0);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }


  /* Verify the group's stmts are isomorphic; also detects the
     two-operator (e.g. addsub) case and computes per-lane swaps.  */
  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load; recurse into the single mask operand below.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		  (load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPR have one too many eventually if the condition
	     is a SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  /* External or constant operands become an invariant leaf.  */
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  /* Ownership of def_stmts transferred to the child.  */
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->ops);
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of not successful permutes was not
	     more than a cut-off as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      /* Retry once with the other set of lanes before
			 giving up.  */
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      /* All recovery attempts failed; free what we built so far.  */
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If all children of a non-unary node are built up from scalars
     then just throw those away, causing the parent to be built up
     from scalars as well.  */
  if (nops > 1
      && is_a <bb_vec_info> (vinfo)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      FOR_EACH_VEC_ELT (children, j, child)
	if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
	  break;
      if (!child)
	{
	  /* Roll back.  */
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	child->refcnt++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      /* Lanes with the first code come from child 'one', the others
	 from child 'two'; record that as a lane permutation.  */
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
1530
1531 /* Dump a single SLP tree NODE. */
1532
static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits), node->refcnt);
  /* Either the node carries scalar stmts or scalar operands.  */
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      /* Lane permutation entries are (child index, lane) pairs.  */
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  /* Children are referenced by address only; the graph walker prints
     each child node once.  */
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}
1585
/* Debugger entry point: dump a single SLP tree NODE.  */

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}
1594
1595 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1596
1597 static void
1598 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1599 slp_tree node, hash_set<slp_tree> &visited)
1600 {
1601 unsigned i;
1602 slp_tree child;
1603
1604 if (visited.add (node))
1605 return;
1606
1607 vect_print_slp_tree (dump_kind, loc, node);
1608
1609 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1610 vect_print_slp_graph (dump_kind, loc, child, visited);
1611 }
1612
1613 static void
1614 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1615 slp_tree entry)
1616 {
1617 hash_set<slp_tree> visited;
1618 vect_print_slp_graph (dump_kind, loc, entry, visited);
1619 }
1620
1621 /* Mark the tree rooted at NODE with PURE_SLP. */
1622
1623 static void
1624 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1625 {
1626 int i;
1627 stmt_vec_info stmt_info;
1628 slp_tree child;
1629
1630 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1631 return;
1632
1633 if (visited.add (node))
1634 return;
1635
1636 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1637 STMT_SLP_TYPE (stmt_info) = pure_slp;
1638
1639 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1640 vect_mark_slp_stmts (child, visited);
1641 }
1642
1643 static void
1644 vect_mark_slp_stmts (slp_tree node)
1645 {
1646 hash_set<slp_tree> visited;
1647 vect_mark_slp_stmts (node, visited);
1648 }
1649
1650 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1651
1652 static void
1653 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1654 {
1655 int i;
1656 stmt_vec_info stmt_info;
1657 slp_tree child;
1658
1659 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1660 return;
1661
1662 if (visited.add (node))
1663 return;
1664
1665 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1666 {
1667 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1668 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1669 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1670 }
1671
1672 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1673 vect_mark_slp_stmts_relevant (child, visited);
1674 }
1675
1676 static void
1677 vect_mark_slp_stmts_relevant (slp_tree node)
1678 {
1679 hash_set<slp_tree> visited;
1680 vect_mark_slp_stmts_relevant (node, visited);
1681 }
1682
1683 /* Copy the SLP subtree rooted at NODE. */
1684
static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  /* MAP preserves sharing: a node reachable via multiple paths is
     copied only once.  */
  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  /* Insert before recursing so cycles/diamonds resolve to this copy.  */
  copy_ref = new _slp_tree;
  slp_tree copy = copy_ref;
  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
  SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
  copy->max_nunits = node->max_nunits;
  /* The copy starts unreferenced; parents bump this below.  */
  copy->refcnt = 0;
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      /* Each copy counts as another SLP use of the scalar stmts.  */
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  /* Copying is only done before vectorized stmts exist.  */
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());

  slp_tree child;
  /* Replace the copied child pointers with copies of the children,
     taking a reference for each edge.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}
1728
1729 /* Rearrange the statements of NODE according to PERMUTATION. */
1730
static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  /* Rearrange children first so the whole graph uses one order.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      /* Scatter: the stmt in lane I moves to lane PERMUTATION[I].  */
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      /* Same scatter for invariant operands.  */
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
      /* Remap the lane indices an existing lane permutation refers to.  */
      for (i = 0; i < group_size; ++i)
	SLP_TREE_LANE_PERMUTATION (node)[i].second
	  = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
    }
}
1777
1778
1779 /* Attempt to reorder stmts in a reduction chain so that we don't
1780 require any load permutation. Return true if that was possible,
1781 otherwise return false. */
1782
static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
    return false;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    return false;
  unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      /* All load nodes must use the identical permutation for the
	 rearrangement to be valid.  */
      if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
	  || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
	return false;
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
	if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  /* The permutation must be a bijection onto [0, group_size).  */
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */

  /* We have to unshare the SLP tree we modify.  */
  hash_map<slp_tree, slp_tree> map;
  slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
  vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
  unshared->refcnt++;
  SLP_INSTANCE_TREE (slp_instn) = unshared;
  /* Redirect the instance's load list to the copied nodes.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
  node = SLP_INSTANCE_LOADS (slp_instn)[0];

  /* Do the actual re-arrangement.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
	      && DR_GROUP_GAP (first_stmt_info) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	/* Keep a permutation vector but make it the identity.  */
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
1865
1866 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1867
1868 static void
1869 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1870 hash_set<slp_tree> &visited)
1871 {
1872 if (visited.add (node))
1873 return;
1874
1875 if (SLP_TREE_CHILDREN (node).length () == 0)
1876 {
1877 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1878 return;
1879 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1880 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1881 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1882 loads.safe_push (node);
1883 }
1884 else
1885 {
1886 unsigned i;
1887 slp_tree child;
1888 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1889 vect_gather_slp_loads (loads, child, visited);
1890 }
1891 }
1892
1893 static void
1894 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1895 {
1896 hash_set<slp_tree> visited;
1897 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1898 }
1899
1900
/* Find the last scalar stmt (in original stmt order) among the
   scalar stmts of SLP node NODE.  */

1903 stmt_vec_info
1904 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1905 {
1906 stmt_vec_info last = NULL;
1907 stmt_vec_info stmt_vinfo;
1908
1909 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1910 {
1911 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1912 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1913 }
1914
1915 return last;
1916 }
1917
1918 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1919 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1920 (also containing the first GROUP1_SIZE stmts, since stores are
1921 consecutive), the second containing the remainder.
1922 Return the first stmt in the second group. */
1923
static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  /* Must be called on the group leader, and both halves must be
     non-empty.  */
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  /* Walk to the last element of the first group.  */
  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      /* Stores are consecutive, so inner gaps are all 1.  */
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  /* Re-link the second group with GROUP2 as its leader.  */
  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
1964
1965 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1966 statements and a vector of NUNITS elements. */
1967
1968 static poly_uint64
1969 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1970 {
1971 return exact_div (common_multiple (nunits, group_size), group_size);
1972 }
1973
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.
   STMT_INFO seeds the instance and may be the first element of a grouped
   access, the head of a reduction chain, a vector CONSTRUCTOR assignment,
   or a representative of the loop reductions.  BST_MAP caches already
   built SLP subtrees; MAX_TREE_SIZE limits the size of the built tree.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size;
  tree vectype, scalar_type = NULL_TREE;
  unsigned int i;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  vec<stmt_vec_info> scalar_stmts;
  bool constructor = false;

  /* Determine GROUP_SIZE and VECTYPE from the kind of seed stmt.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      scalar_type = TREE_TYPE (DR_REF (dr));
      group_size = DR_GROUP_SIZE (stmt_info);
      vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = REDUC_GROUP_SIZE (stmt_info);
    }
  else if (is_gimple_assign (stmt_info->stmt)
	    && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
    {
      vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
      group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
      constructor = true;
    }
  else
    {
      /* Remaining case: the whole set of loop reductions.  */
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (stmt_info);
      group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type %T\n",
			 scalar_type);

      return false;
    }
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  stmt_vec_info next_info = stmt_info;
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
	{
	  scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
	}
    }
  else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Collect the reduction stmts and store them in
	 SLP_TREE_SCALAR_STMTS.  */
      while (next_info)
	{
	  scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
    }
  else if (constructor)
    {
      /* The CONSTRUCTOR element defs become the scalar stmts; each
	 element must be an SSA name defined inside this region.  */
      tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
      tree val;
      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
	{
	  if (TREE_CODE (val) == SSA_NAME)
	    {
	      gimple* def = SSA_NAME_DEF_STMT (val);
	      stmt_vec_info def_info = vinfo->lookup_stmt (def);
	      /* Value is defined in another basic block.  */
	      if (!def_info)
		return false;
	      def_info = vect_stmt_to_vectorize (def_info);
	      scalar_stmts.safe_push (def_info);
	    }
	  else
	    return false;
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 stmt_info->stmt);
    }
  else
    {
      /* Collect reduction statements.  */
      vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      for (i = 0; reductions.iterate (i, &next_info); i++)
	scalar_stmts.safe_push (next_info);
    }

  /* Build the tree for the SLP instance.  On failure MATCHES records
     per lane whether it matched, used for group splitting below.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  poly_uint64 max_nunits = nunits;
  unsigned tree_size = 0;
  node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
			      &max_nunits, matches, &npermutes,
			      &tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  /* BB vectorization cannot unroll; the group has to be a
	     multiple of the vector size.  */
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node, false);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node, false);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;

	  vect_gather_slp_loads (new_instance, node);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  /* Check whether any load is possibly permuted.  */
	  slp_tree load_node;
	  bool loads_permuted = false;
	  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
	    {
	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
		continue;
	      unsigned j;
	      stmt_vec_info load_info;
	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
		if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
		  {
		    loads_permuted = true;
		    break;
		  }
	    }

	  /* If the loads and stores can be handled with load/store-lane
	     instructions do not generate this SLP instance.  */
	  if (is_a <loop_vec_info> (vinfo)
	      && loads_permuted
	      && dr && vect_store_lanes_supported (vectype, group_size, false))
	    {
	      slp_tree load_node;
	      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
		{
		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
		  /* Use SLP for strided accesses (or if we can't load-lanes).  */
		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
		      || ! vect_load_lanes_supported
			    (STMT_VINFO_VECTYPE (stmt_vinfo),
			     DR_GROUP_SIZE (stmt_vinfo), false))
		    break;
		}
	      /* Loop above ran to completion, so every load can use
		 load-lanes; cancel the instance.  */
	      if (i == SLP_INSTANCE_LOADS (new_instance).length ())
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Built SLP cancelled: can use "
				     "load/store-lanes\n");
		  vect_free_slp_instance (new_instance, false);
		  return false;
		}
	    }

	  /* If this is a reduction chain with a conversion in front
	     amend the SLP tree with a node for that.  */
	  if (!dr
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
	    {
	      /* Get at the conversion stmt - we know it's the single use
		 of the last stmt of the reduction chain.  */
	      gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
	      use_operand_p use_p;
	      gimple *use_stmt;
	      bool r = single_imm_use (gimple_assign_lhs (tem),
				       &use_p, &use_stmt);
	      gcc_assert (r);
	      next_info = vinfo->lookup_stmt (use_stmt);
	      next_info = vect_stmt_to_vectorize (next_info);
	      /* The old SCALAR_STMTS vector is owned by NODE now; build a
		 fresh one for the conversion node.  */
	      scalar_stmts = vNULL;
	      scalar_stmts.create (group_size);
	      for (unsigned i = 0; i < group_size; ++i)
		scalar_stmts.quick_push (next_info);
	      slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
	      SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
	      SLP_TREE_CHILDREN (conv).quick_push (node);
	      SLP_INSTANCE_TREE (new_instance) = conv;
	      /* We also have to fake this conversion stmt as SLP reduction
		 group so we don't have to mess with too much code
		 elsewhere.  */
	      REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
	      REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
			.length () == group_size);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance:\n");
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
    }

  /* For basic block SLP, try to break the group up into multiples of the
     vector size.  */
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_GROUP_FIRST_ELEMENT (stmt_info)
      && nunits.is_constant (&const_nunits))
    {
      /* We consider breaking the group only on VF boundaries from the existing
	 start.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i]) break;

      if (i >= const_nunits && i < group_size)
	{
	  /* Split into two groups at the first vector boundary before i.  */
	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	  unsigned group1_size = i & ~(const_nunits - 1);

	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							   group1_size);
	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						max_tree_size);
	  /* If the first non-match was in the middle of a vector,
	     skip the rest of that vector.  */
	  if (group1_size < i)
	    {
	      i = group1_size + const_nunits;
	      if (i < group_size)
		rest = vect_split_slp_store_group (rest, const_nunits);
	    }
	  if (i < group_size)
	    res |= vect_analyze_slp_instance (vinfo, bst_map,
					      rest, max_tree_size);
	  return res;
	}
      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  return false;
}
2283
2284
/* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
   trees of packed scalar stmts if SLP is possible.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  stmt_vec_info first_element;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  /* Map from scalar stmt sets to already built SLP nodes so identical
     subtrees are shared across instances.  */
  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      if (loop_vinfo->reduction_chains.length () > 0)
	{
	  /* Find SLP sequences starting from reduction chains.  */
	  FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	    if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
					     max_tree_size))
	      {
		/* Dissolve reduction chain group.  */
		/* NOTE: this local shadows the function parameter VINFO.  */
		stmt_vec_info vinfo = first_element;
		stmt_vec_info last = NULL;
		while (vinfo)
		  {
		    stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
		    REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
		    REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		    last = vinfo;
		    vinfo = next;
		  }
		STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
		/* It can be still vectorized as part of an SLP reduction.  */
		loop_vinfo->reductions.safe_push (last);
	      }
	}

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
				   max_tree_size);
    }

  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second, false);
  delete bst_map;

  /* Optimize permutations in SLP reductions.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      /* Reduction (there are no data-refs in the root).
	 In reduction chain the order of the loads is not important.  */
      if (!STMT_VINFO_DATA_REF (stmt_info)
	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
	vect_attempt_slp_rearrange_stmts (instance);
    }

  /* Gather all loads in the SLP graph.  */
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
			   visited);

  return opt_result::success ();
}
2363
/* Walk the load nodes recorded in VINFO->slp_loads and release load
   permutations that turn out to be unnecessary: in BB SLP when the
   loads form a contiguous subchain of their interleaving group, in
   loop SLP when the permutation is the identity and unrolling does
   not expose a gap.  */

void
vect_optimize_slp (vec_info *vinfo)
{
  slp_tree node;
  unsigned i;
  FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
    {
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (vinfo))
	{
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  unsigned j;
	  /* A subchain means every load is the immediate group successor
	     of the previous one with a gap of exactly 1.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      if (j != 0
		  && (next_load_info != load_info
		      || DR_GROUP_GAP (load_info) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	    }
	  if (subchain_p)
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
      else
	{
	  stmt_vec_info load_info;
	  bool this_load_permuted = false;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
	      {
		this_load_permuted = true;
		break;
	      }
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  if (!this_load_permuted
	      /* The load requires permutation when unrolling exposes
		 a gap either because the group is larger than the SLP
		 group-size or because there is a gap between the groups.  */
	      && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
		      && DR_GROUP_GAP (first_stmt_info) == 0)))
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
    }
}
2427
2428
2429 /* For each possible SLP instance decide whether to SLP it and calculate overall
2430 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2431 least one instance. */
2432
2433 bool
2434 vect_make_slp_decision (loop_vec_info loop_vinfo)
2435 {
2436 unsigned int i;
2437 poly_uint64 unrolling_factor = 1;
2438 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2439 slp_instance instance;
2440 int decided_to_slp = 0;
2441
2442 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2443
2444 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2445 {
2446 /* FORNOW: SLP if you can. */
2447 /* All unroll factors have the form:
2448
2449 GET_MODE_SIZE (vinfo->vector_mode) * X
2450
2451 for some rational X, so they must have a common multiple. */
2452 unrolling_factor
2453 = force_common_multiple (unrolling_factor,
2454 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2455
2456 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2457 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2458 loop-based vectorization. Such stmts will be marked as HYBRID. */
2459 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2460 decided_to_slp++;
2461 }
2462
2463 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2464
2465 if (decided_to_slp && dump_enabled_p ())
2466 {
2467 dump_printf_loc (MSG_NOTE, vect_location,
2468 "Decided to SLP %d instances. Unrolling factor ",
2469 decided_to_slp);
2470 dump_dec (MSG_NOTE, unrolling_factor);
2471 dump_printf (MSG_NOTE, "\n");
2472 }
2473
2474 return (decided_to_slp > 0);
2475 }
2476
/* Private data for vect_detect_hybrid_slp.  */
struct vdhs_data
{
  /* The loop being analyzed.  */
  loop_vec_info loop_vinfo;
  /* Worklist of stmts whose operands still have to be scanned for
     pure-SLP definitions to mark as hybrid.  */
  vec<stmt_vec_info> *worklist;
};
2483
2484 /* Walker for walk_gimple_op. */
2485
2486 static tree
2487 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2488 {
2489 walk_stmt_info *wi = (walk_stmt_info *)data;
2490 vdhs_data *dat = (vdhs_data *)wi->info;
2491
2492 if (wi->is_lhs)
2493 return NULL_TREE;
2494
2495 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2496 if (!def_stmt_info)
2497 return NULL_TREE;
2498 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2499 if (PURE_SLP_STMT (def_stmt_info))
2500 {
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2503 def_stmt_info->stmt);
2504 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2505 dat->worklist->safe_push (def_stmt_info);
2506 }
2507
2508 return NULL_TREE;
2509 }
2510
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.  */
  auto_vec<stmt_vec_info> worklist;
  for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    worklist.safe_push (stmt_info);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      /* For stmts replaced by a pattern, scan the pattern def
		 sequence and then continue with the pattern stmt itself
		 instead of the original.  */
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		{
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    worklist.safe_push (patt_info);
		}
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	    }
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    worklist.safe_push (stmt_info);
	}
    }

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ???  We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  walk_stmt_info wi;
  vdhs_data dat;
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
	 to use walk_gimple_op.  */
      wi.is_lhs = 0;
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
    }
}
2578
2579
/* Initialize a bb_vec_info struct for the statements between
   REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).  */

_bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
			    gimple_stmt_iterator region_end_in,
			    vec_info_shared *shared)
  : vec_info (vec_info::bb, init_cost (NULL), shared),
    bb (gsi_bb (region_begin_in)),
    region_begin (region_begin_in),
    region_end (region_end_in)
{
  /* Mark each stmt as belonging to the region (UID 0; the destructor
     resets it to -1) and create stmt_vec_infos for all non-debug stmts.  */
  for (gimple *stmt : this->region_stmts ())
    {
      gimple_set_uid (stmt, 0);
      if (is_gimple_debug (stmt))
	continue;
      add_stmt (stmt);
    }

  /* Let the basic block point back at us.  */
  bb->aux = this;
}
2601
2602
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  for (gimple *stmt : this->region_stmts ())
    /* Reset region marker.  */
    gimple_set_uid (stmt, -1);

  /* Detach ourselves from the basic block (set in the constructor).  */
  bb->aux = NULL;
}
2614
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given then that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
  gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);

  /* Calculate the number of vector statements to be created for the
     scalar stmts in this node.  For SLP reductions it is equal to the
     number of vector statements in the children (which has already been
     calculated by the recursive call).  Otherwise it is the number of
     scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
     VF divided by the number of elements in a vector.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* Copy the count from the first internal-def child.  */
      for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
	  {
	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
	    break;
	  }
    }
  else
    {
      poly_uint64 vf;
      if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	vf = loop_vinfo->vectorization_factor;
      else
	vf = 1;		/* BB vectorization never unrolls.  */
      unsigned int group_size = SLP_TREE_LANES (node);
      tree vectype = SLP_TREE_VECTYPE (node);
      SLP_TREE_NUMBER_OF_VEC_STMTS (node)
	= vect_get_num_vectors (vf * group_size, vectype);
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
}
2665
2666 /* Try to build NODE from scalars, returning true on success.
2667 NODE_INSTANCE is the SLP instance that contains NODE. */
2668
2669 static bool
2670 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2671 slp_instance node_instance)
2672 {
2673 stmt_vec_info stmt_info;
2674 unsigned int i;
2675
2676 if (!is_a <bb_vec_info> (vinfo)
2677 || node == SLP_INSTANCE_TREE (node_instance)
2678 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2679 return false;
2680
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_NOTE, vect_location,
2683 "Building vector operands from scalars instead\n");
2684
2685 /* Don't remove and free the child nodes here, since they could be
2686 referenced by other structures. The analysis and scheduling phases
2687 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2688 unsigned int group_size = SLP_TREE_LANES (node);
2689 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2690 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2691 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2692 {
2693 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2694 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2695 }
2696 return true;
2697 }
2698
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned num_vects_to_check;
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      /* Vectors differ because lanes wrap at non-vector boundaries;
	 cost each generated vector separately.  */
      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      nelt_limit = const_nunits;
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      num_vects_to_check = 1;
      nelt_limit = group_size;
    }
  tree elt = NULL_TREE;
  unsigned nelt = 0;
  /* ELT stays non-NULL only while every operand of the vector being
     built is the same, in which case a splat suffices.  */
  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
    {
      unsigned si = j % group_size;
      if (nelt == 0)
	elt = SLP_TREE_SCALAR_OPS (node)[si];
      /* ???  We're just tracking whether all operands of a single
	 vector initializer are the same, ideally we'd check if
	 we emitted the same one already.  */
      else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
	elt = NULL_TREE;
      nelt++;
      if (nelt == nelt_limit)
	{
	  /* Externals are built from scalar defs (splat or general
	     construction); constants are loaded from the pool.  */
	  record_stmt_cost (cost_vec, 1,
			    SLP_TREE_DEF_TYPE (node) == vect_external_def
			    ? (elt ? scalar_to_vec : vec_construct)
			    : vector_load,
			    NULL, vectype, 0, vect_prologue);
	  nelt = 0;
	}
    }
}
2752
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
   VISITED holds nodes analyzed (successfully) in previous instances;
   LVISITED holds nodes analyzed so far for the current instance and is
   thrown away on failure.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited,
				  hash_set<slp_tree> &lvisited,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return true;

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.
     The SLP graph is acyclic so not caching whether we failed or succeeded
     doesn't result in any issue since we throw away the lvisited set
     when we fail.  */
  if (visited.contains (node)
      || lvisited.add (node))
    return true;

  /* Analyze children first (post-order).  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
					   visited, lvisited, cost_vec))
      return false;

  bool res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
						 cost_vec);

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the refering node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
	   || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited.contains (child)
	  && !lvisited.add (child))
	{
	  /* ???  After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ???  Represent this more explicitely.  */
	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
			   == shift_vec_info_type)
			  && j == 1);
	      continue;
	    }
	  unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
	  poly_uint64 vf = 1;
	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	    vf = loop_vinfo->vectorization_factor;
	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	    = vect_get_num_vectors (vf * group_size, vector_type);
	  /* And cost them.  */
	  vect_prologue_cost_for_slp (child, cost_vec);
	}

  /* If this node can't be vectorized, try pruning the tree here rather
     than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      lvisited.remove (node);
      res = true;
    }

  return res;
}
2841
2842
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  Instances that fail analysis are removed
   from VINFO->slp_instances.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      /* Per-instance visited set, merged into VISITED only on success
	 so a failed instance does not poison the shared cache.  */
      hash_set<slp_tree> lvisited;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, lvisited,
					     &cost_vec)
	  /* Instances with a root stmt require vectorized defs for the
	     SLP tree root.  */
	  || (SLP_INSTANCE_ROOT_STMT (instance)
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def)))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance, false);
	  vinfo->slp_instances.ordered_remove (i);
	  cost_vec.release ();
	}
      else
	{
	  for (hash_set<slp_tree>::iterator x = lvisited.begin();
	       x != lvisited.end(); ++x)
	    visited.add (*x);
	  i++;

	  add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
	  cost_vec.release ();
	}
    }

  return !vinfo->slp_instances.is_empty ();
}
2894
2895
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  Costs are recorded in
   COST_VEC; VISITED guards against costing shared nodes twice.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if ((*life)[i])
	continue;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt))
	      {
		stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
		if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
		  {
		    (*life)[i] = true;
		    BREAK_FROM_IMM_USE_STMT (use_iter);
		  }
	      }
	}
      if ((*life)[i])
	continue;

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (stmt))
	continue;
      gimple_set_visited (stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (stmt_info))
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      else if (vect_nop_conversion_p (stmt_info))
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	    {
	      /* Map each of our lanes back through the lane permutation
		 to the corresponding lane of this child.  */
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child));
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   visited);
	  subtree_life.truncate (0);
	}
    }
}
2994
/* Check if vectorization of the basic block is profitable.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  /* Calculate scalar cost by walking each instance's SLP tree;
     VISITED avoids double-counting nodes shared between instances.  */
  stmt_vector_for_cost scalar_costs;
  scalar_costs.create (0);
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* LIFE starts all-false; lanes with non-vectorized uses are
	 marked live (and thus not costed) during the walk.  */
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)));
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, visited);
    }
  /* Unset visited flag.  It was used above to count each scalar stmt
     only once.  */
  stmt_info_for_cost *si;
  FOR_EACH_VEC_ELT (scalar_costs, i, si)
    gimple_set_visited (si->stmt_info->stmt, false);

  /* Feed the collected scalar costs through the target cost model.  */
  void *target_cost_data = init_cost (NULL);
  add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
  scalar_costs.release ();
  unsigned dummy;
  finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
  destroy_cost_data (target_cost_data);

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its cost does not exceed the cost
     of the scalar version.  Note that we err on the vector side for
     equal cost because the cost estimate is otherwise quite pessimistic
     (constant uses are free on the scalar side but cost a load on the
     vector side for example).  */
  if (vec_outside_cost + vec_inside_cost > scalar_cost)
    return false;

  return true;
}
3056
3057 /* Find any vectorizable constructors and add them to the grouped_store
3058 array. */
3059
3060 static void
3061 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3062 {
3063 for (gimple *stmt : bb_vinfo->region_stmts ())
3064 {
3065 gassign *assign = dyn_cast<gassign *> (stmt);
3066 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3067 continue;
3068
3069 tree rhs = gimple_assign_rhs1 (assign);
3070 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3071 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3072 CONSTRUCTOR_NELTS (rhs))
3073 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3074 || uniform_vector_p (rhs))
3075 continue;
3076
3077 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3078 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3079 }
3080 }
3081
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");
      return false;
    }

  /* Fewer than two data references cannot form a group.  */
  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: not enough data-refs in "
			 "basic block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");
      return false;
    }

  /* Vector constructors count as grouped stores for SLP discovery.  */
  vect_slp_check_for_constructors (bb_vinfo);

  /* If there are no grouped stores in the region there is no need
     to continue with pattern recog as vect_analyze_slp will fail
     anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: no grouped stores in "
			 "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Failed to SLP the basic block.\n");
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: failed to find SLP opportunities "
			   "in basic block.\n");
	}
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  Instances failing either check
     are removed; I is only advanced when the instance is kept.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance, false);
	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
	  continue;
	}

      /* Mark all the statements that we want to vectorize as pure SLP and
	 relevant.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
      if (SLP_INSTANCE_ROOT_STMT (instance))
	STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");
      return false;
    }

  /* Cost model: check if the vectorization is worthwhile.  */
  if (!unlimited_cost_model (NULL)
      && !vect_bb_vectorization_profitable_p (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization is not "
			 "profitable.\n");
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Basic block will be vectorized using SLP\n");
  return true;
}
3222
/* Subroutine of vect_slp_bb.  Try to vectorize the statements between
   REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
   on success.  The region has N_STMTS statements and has the datarefs
   given by DATAREFS.  Analysis is retried with successively different
   vector modes until it succeeds or the mode list is exhausted.  */

static bool
vect_slp_bb_region (gimple_stmt_iterator region_begin,
		    gimple_stmt_iterator region_end,
		    vec<data_reference_p> datarefs,
		    unsigned int n_stmts)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);

      /* Datarefs are shared between iterations; the first iteration
	 saves them, later ones verify they are unchanged.  */
      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
	bb_vinfo->shared->save_datarefs ();
      else
	bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
	  && dbg_cnt (vect_slp))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** Analysis succeeded with vector mode"
			       " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
	      dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
	    }

	  bb_vinfo->shared->check_datarefs ();
	  vect_schedule_slp (bb_vinfo);

	  unsigned HOST_WIDE_INT bytes;
	  if (dump_enabled_p ())
	    {
	      if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
				 "basic block part vectorized using %wu byte "
				 "vectors\n", bytes);
	      else
		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
				 "basic block part vectorized using variable "
				 "length vectors\n");
	    }

	  vectorized = true;
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Analysis failed with vector mode %s\n",
			     GET_MODE_NAME (bb_vinfo->vector_mode));
	}

      /* Remember the mode the first (autodetecting) iteration chose.  */
      if (mode_i == 0)
	autodetected_vector_mode = bb_vinfo->vector_mode;

      /* Skip modes that would lead to the exact same vector type
	 choices as the analysis just done.  */
      if (!fatal)
	while (mode_i < vector_modes.length ()
	       && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** The result for vector mode %s would"
			       " be the same\n",
			       GET_MODE_NAME (vector_modes[mode_i]));
	    mode_i += 1;
	  }

      delete bb_vinfo;

      /* Also skip a mode that is mutually related to the autodetected
	 mode - analyzing it would just repeat the first analysis.  */
      if (mode_i < vector_modes.length ()
	  && VECTOR_MODE_P (autodetected_vector_mode)
	  && (related_vector_mode (vector_modes[mode_i],
				   GET_MODE_INNER (autodetected_vector_mode))
	      == autodetected_vector_mode)
	  && (related_vector_mode (autodetected_vector_mode,
				   GET_MODE_INNER (vector_modes[mode_i]))
	      == vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Skipping vector mode %s, which would"
			     " repeat the analysis for %s\n",
			     GET_MODE_NAME (vector_modes[mode_i]),
			     GET_MODE_NAME (autodetected_vector_mode));
	  mode_i += 1;
	}

      if (vectorized
	  || mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode
	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
	     vector sizes will fail do not bother iterating.  */
	  || fatal)
	return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n",
			 GET_MODE_NAME (next_vector_mode));
    }
}
3347
/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.  The block is
   split into maximal regions of statements whose data references we
   can handle; each region is vectorized independently.  */

bool
vect_slp_bb (basic_block bb)
{
  gimple_stmt_iterator gsi;
  bool any_vectorized = false;

  gsi = gsi_after_labels (bb);
  while (!gsi_end_p (gsi))
    {
      gimple_stmt_iterator region_begin = gsi;
      vec<data_reference_p> datarefs = vNULL;
      int insns = 0;

      /* Extend the region until a stmt with an unanalyzable data
	 reference is found (or the block ends).  */
      for (; !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    {
	      /* Skip leading debug stmts.  */
	      if (gsi_stmt (region_begin) == stmt)
		gsi_next (&region_begin);
	      continue;
	    }
	  insns++;

	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
	    vect_location = stmt;

	  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
	    break;
	}
      /* Only debug stmts remained - nothing left to vectorize.  */
      if (gsi_end_p (region_begin))
	break;

      /* Skip leading unhandled stmts.  */
      if (gsi_stmt (region_begin) == gsi_stmt (gsi))
	{
	  gsi_next (&gsi);
	  continue;
	}

      gimple_stmt_iterator region_end = gsi;

      if (insns > param_slp_max_insns_in_bb)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: too many instructions in "
			     "basic block.\n");
	}
      else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
	any_vectorized = true;

      if (gsi_end_p (region_end))
	break;

      /* Skip the unhandled stmt.  */
      gsi_next (&gsi);
    }

  return any_vectorized;
}
3413
3414
/* Build a variable-length vector in which the elements in ELTS are repeated
   to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  vec<tree> elts, unsigned int nresults,
			  vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  tree_vector_builder partial_elts;
  /* PIECES holds the working set: inputs in the first half, outputs of
     the current interleaving round in the second half.  */
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
	 correct byte contents.

     We need to repeat the following operation log2(nvectors) times:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int hi_start = nvectors / 2;
  /* A bound on the number of outputs needed to produce NRESULTS results
     in the final iteration.  */
  unsigned int noutputs_bound = nvectors * nresults;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      noutputs_bound /= 2;
      unsigned int limit = MIN (noutputs_bound, nvectors);
      for (unsigned int i = 0; i < limit; ++i)
	{
	  /* HI output equals the LO output just produced - reuse it.  */
	  if ((i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    {
	      pieces[out_start + i] = pieces[out_start + i - 1];
	      continue;
	    }

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (i / 2)];
	  tree input2 = pieces[in_start + (i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + i] = output;
	}
      /* The outputs of this round are the inputs of the next.  */
      std::swap (in_start, out_start);
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - nvectors]);
}
3527
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  /* For BB vectorization track the latest region-internal def among the
     operands of the vector currently being filled, so the vector def
     can be inserted after it rather than at the region start.  */
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      /* Operands are consumed last-to-first; the resulting vectors are
	 reversed again below.  */
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
	{
	  /* Create 'vect_ = {op0,op1,...,opn}'.  */
	  number_of_places_left_in_vector--;
	  tree orig_op = op;
	  /* Convert OP to the vector element type if necessary.  */
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  /* Non-constant operand: emit the conversion stmt into
		     CTOR_SEQ, inserted before the vector def below.  */
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  /* For BB vectorization we have to compute an insert location
	     when a def is inside the analyzed region since we cannot
	     simply insert at the BB start in this case.  */
	  stmt_vec_info opdef;
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && is_a <bb_vec_info> (vinfo)
	      && (opdef = vinfo->lookup_def (orig_op)))
	    {
	      if (!insert_after)
		insert_after = opdef;
	      else
		insert_after = get_later_stmt (insert_after, opdef);
	    }

	  if (number_of_places_left_in_vector == 0)
	    {
	      /* The vector is full - materialize it.  */
	      if (constant_p
		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  if (permute_results.is_empty ())
		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
					      elts, number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      tree init;
	      if (insert_after)
		{
		  gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
		  /* vect_init_vector inserts before.  */
		  gsi_next (&gsi);
		  init = vect_init_vector (vinfo, NULL, vec_cst,
					   vector_type, &gsi);
		}
	      else
		init = vect_init_vector (vinfo, NULL, vec_cst,
					 vector_type, NULL);
	      if (ctor_seq != NULL)
		{
		  /* Place the accumulated conversion stmts right before
		     the vector def just created.  */
		  gimple_stmt_iterator gsi
		    = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
		  gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
		  ctor_seq = NULL;
		}
	      voprnds.quick_push (init);
	      /* Reset per-vector state for the next vector.  */
	      insert_after = NULL;
	      number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	    }
	}
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
	 i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
3718
3719 /* Get the Ith vectorized definition from SLP_NODE. */
3720
3721 tree
3722 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
3723 {
3724 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
3725 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
3726 else
3727 return SLP_TREE_VEC_DEFS (slp_node)[i];
3728 }
3729
3730 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
3731
3732 void
3733 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
3734 {
3735 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
3736 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
3737 {
3738 unsigned j;
3739 gimple *vec_def_stmt;
3740 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
3741 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
3742 }
3743 else
3744 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
3745 }
3746
3747 /* Get N vectorized definitions for SLP_NODE. */
3748
3749 void
3750 vect_get_slp_defs (vec_info *,
3751 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3752 {
3753 if (n == -1U)
3754 n = SLP_TREE_CHILDREN (slp_node).length ();
3755
3756 for (unsigned i = 0; i < n; ++i)
3757 {
3758 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3759 vec<tree> vec_defs = vNULL;
3760 vect_get_slp_defs (child, &vec_defs);
3761 vec_oprnds->quick_push (vec_defs);
3762 }
3763 }
3764
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  VF is the vectorization
   factor; N_PERMS is set to the number of distinct permutes emitted
   (or that would be emitted).  */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
			      slp_tree node, vec<tree> dr_chain,
			      gimple_stmt_iterator *gsi, poly_uint64 vf,
			      bool analyze_only, unsigned *n_perms)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  machine_mode mode;

  /* Load permutations only make sense for grouped accesses.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
		      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
	 (a) all permutes can use the same mask; and
	 (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
	  || !vf.is_constant (&const_vf))
	return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
    }

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      /* Map scalar element J through the load permutation to the
	 element index I in the loaded group.  */
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
			+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
      if (repeating_p)
	{
	  first_vec_index = 0;
	  mask_element = i;
	}
      else
	{
	  /* Enforced before the loop when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  vec_index = i / const_nunits;
	  mask_element = i % const_nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    {
	      first_vec_index = vec_index;
	    }
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      mask_element += const_nunits;
	    }
	  else
	    {
	      /* VEC_PERM_EXPR has at most two vector inputs.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "permutation requires at "
				 "least three vectors %G",
				 stmt_info->stmt);
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * const_nunits);
	}

      if (mask_element != index)
	noop_p = false;
      mask[index++] = mask_element;

      /* A full mask was built; verify the target supports it.  */
      if (index == count && !noop_p)
	{
	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
	  if (!can_vec_perm_const_p (mode, indices))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (analyze_only);
	      return false;
	    }

	  ++*n_perms;
	}

      if (index == count)
	{
	  if (!analyze_only)
	    {
	      tree mask_vec = NULL_TREE;

	      if (! noop_p)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      if (second_vec_index == -1)
		second_vec_index = first_vec_index;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gimple *perm_stmt;
		  if (! noop_p)
		    {
		      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
		      tree perm_dest
			= vect_create_destination_var (gimple_assign_lhs (stmt),
						       vectype);
		      perm_dest = make_ssa_name (perm_dest);
		      perm_stmt
			= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
					       first_vec, second_vec,
					       mask_vec);
		      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
						   gsi);
		    }
		  else
		    /* If mask was NULL_TREE generate the requested
		       identity transform.  */
		    perm_stmt = SSA_NAME_DEF_STMT (first_vec);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
		}
	    }

	  /* Reset state for the next mask.  */
	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  return true;
}
3973
3974
3975 /* Vectorize the SLP permutations in NODE as specified
3976 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
3977 child number and lane number.
3978 Interleaving of two two-lane two-child SLP subtrees (not supported):
3979 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
3980 A blend of two four-lane two-child SLP subtrees:
3981 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
3982 Highpart of a four-lane one-child SLP subtree (not supported):
3983 [ { 0, 2 }, { 0, 3 } ]
3984 Where currently only a subset is supported by code generating below. */
3985
3986 static bool
3987 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
3988 slp_tree node, stmt_vector_for_cost *cost_vec)
3989 {
3990 tree vectype = SLP_TREE_VECTYPE (node);
3991
3992 /* ??? We currently only support all same vector input and output types
3993 while the SLP IL should really do a concat + select and thus accept
3994 arbitrary mismatches. */
3995 slp_tree child;
3996 unsigned i;
3997 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3998 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
3999 {
4000 if (dump_enabled_p ())
4001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4002 "Unsupported lane permutation\n");
4003 return false;
4004 }
4005
4006 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4007 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4008 if (dump_enabled_p ())
4009 {
4010 dump_printf_loc (MSG_NOTE, vect_location,
4011 "vectorizing permutation");
4012 for (unsigned i = 0; i < perm.length (); ++i)
4013 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4014 dump_printf (MSG_NOTE, "\n");
4015 }
4016
4017 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4018 if (!nunits.is_constant ())
4019 return false;
4020 unsigned HOST_WIDE_INT vf = 1;
4021 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4022 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4023 return false;
4024 unsigned olanes = vf * SLP_TREE_LANES (node);
4025 gcc_assert (multiple_p (olanes, nunits));
4026
4027 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4028 from the { SLP operand, scalar lane } permutation as recorded in the
4029 SLP node as intermediate step. This part should already work
4030 with SLP children with arbitrary number of lanes. */
4031 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4032 auto_vec<unsigned> active_lane;
4033 vperm.create (olanes);
4034 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
4035 for (unsigned i = 0; i < vf; ++i)
4036 {
4037 for (unsigned pi = 0; pi < perm.length (); ++pi)
4038 {
4039 std::pair<unsigned, unsigned> p = perm[pi];
4040 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4041 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4042 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4043 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4044 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4045 }
4046 /* Advance to the next group. */
4047 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4048 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4049 }
4050
4051 if (dump_enabled_p ())
4052 {
4053 dump_printf_loc (MSG_NOTE, vect_location, "as");
4054 for (unsigned i = 0; i < vperm.length (); ++i)
4055 {
4056 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4057 dump_printf (MSG_NOTE, ",");
4058 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4059 vperm[i].first.first, vperm[i].first.second,
4060 vperm[i].first.second);
4061 }
4062 dump_printf (MSG_NOTE, "\n");
4063 }
4064
4065 /* We can only handle two-vector permutes, everything else should
4066 be lowered on the SLP level. The following is closely inspired
4067 by vect_transform_slp_perm_load and is supposed to eventually
4068 replace it.
4069 ??? As intermediate step do code-gen in the SLP tree representation
4070 somehow? */
4071 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4072 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4073 unsigned int const_nunits = nunits.to_constant ();
4074 unsigned int index = 0;
4075 unsigned int mask_element;
4076 vec_perm_builder mask;
4077 mask.new_vector (const_nunits, const_nunits, 1);
4078 unsigned int count = mask.encoded_nelts ();
4079 mask.quick_grow (count);
4080 vec_perm_indices indices;
4081 unsigned nperms = 0;
4082 for (unsigned i = 0; i < vperm.length (); ++i)
4083 {
4084 mask_element = vperm[i].second;
4085 if (first_vec.first == -1U
4086 || first_vec == vperm[i].first)
4087 first_vec = vperm[i].first;
4088 else if (second_vec.first == -1U
4089 || second_vec == vperm[i].first)
4090 {
4091 second_vec = vperm[i].first;
4092 mask_element += const_nunits;
4093 }
4094 else
4095 {
4096 if (dump_enabled_p ())
4097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4098 "permutation requires at "
4099 "least three vectors");
4100 gcc_assert (!gsi);
4101 return false;
4102 }
4103
4104 mask[index++] = mask_element;
4105
4106 if (index == count)
4107 {
4108 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4109 const_nunits);
4110 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4111 {
4112 if (dump_enabled_p ())
4113 {
4114 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4115 vect_location,
4116 "unsupported vect permute { ");
4117 for (i = 0; i < count; ++i)
4118 {
4119 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4120 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4121 }
4122 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4123 }
4124 gcc_assert (!gsi);
4125 return false;
4126 }
4127
4128 nperms++;
4129 if (gsi)
4130 {
4131 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4132
4133 if (second_vec.first == -1U)
4134 second_vec = first_vec;
4135
4136 /* Generate the permute statement if necessary. */
4137 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4138 tree first_def
4139 = vect_get_slp_vect_def (first_node, first_vec.second);
4140 slp_tree second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
4141 tree second_def
4142 = vect_get_slp_vect_def (second_node, second_vec.second);
4143 tree perm_dest = make_ssa_name (vectype);
4144 gassign *perm_stmt
4145 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4146 first_def, second_def,
4147 mask_vec);
4148 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4149 /* Store the vector statement in NODE. */
4150 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4151 }
4152
4153 index = 0;
4154 first_vec = std::make_pair (-1U, -1U);
4155 second_vec = std::make_pair (-1U, -1U);
4156 }
4157 }
4158
4159 if (!gsi)
4160 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4161
4162 return true;
4163 }
4164
/* Vectorize SLP instance tree in postorder - children are vectorized
   before NODE itself so their defs are available.  Computes the
   insertion point for NODE's vectorized stmts and dispatches the
   actual transform.  */

static void
vect_schedule_slp_instance (vec_info *vinfo,
			    slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* See if we have already vectorized the node in the graph of the
     SLP instance.  */
  if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
       && SLP_TREE_VEC_STMTS (node).exists ())
      || SLP_TREE_VEC_DEFS (node).exists ())
    return;

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  /* Recurse into children first (postorder).  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_schedule_slp_instance (vinfo, child, instance);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "------>vectorizing SLP node starting from: %G",
		     stmt_info->stmt);

  /* Vectorized stmts go before the last scalar stmt which is where
     all uses are ready.  */
  stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
  if (last_stmt_info)
    si = gsi_for_stmt (last_stmt_info->stmt);
  else
    {
      /* Or if we do not have 1:1 matching scalar stmts emit after the
	 children vectorized defs.  Find the dominance-wise last vectorized
	 def among all children and insert right after it.  */
      gimple *last_stmt = NULL;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* We are emitting all vectorized stmts in the same place and
	       the last one is the last.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    unsigned j;
	    gimple *vstmt;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
	      if (!last_stmt
		  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		last_stmt = vstmt;
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  Only SSA names
	       have defining stmts; constants impose no ordering.  */
	    unsigned j;
	    tree vdef;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
	      if (TREE_CODE (vdef) == SSA_NAME)
		{
		  gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		    last_stmt = vstmt;
		}
	  }
      /* We cannot insert after a PHI directly; use the spot after the
	 labels of its block instead.  */
      if (is_a <gphi *> (last_stmt))
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);
	}
    }

  bool done_p = false;

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case the stmt is only there for denoting
	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
	 but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      done_p = true;
    }
  if (!done_p)
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}
4273
4274 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4275 For loop vectorization this is done in vectorizable_call, but for SLP
4276 it needs to be deferred until end of vect_schedule_slp, because multiple
4277 SLP instances may refer to the same scalar stmt. */
4278
4279 static void
4280 vect_remove_slp_scalar_calls (vec_info *vinfo,
4281 slp_tree node, hash_set<slp_tree> &visited)
4282 {
4283 gimple *new_stmt;
4284 gimple_stmt_iterator gsi;
4285 int i;
4286 slp_tree child;
4287 tree lhs;
4288 stmt_vec_info stmt_info;
4289
4290 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4291 return;
4292
4293 if (visited.add (node))
4294 return;
4295
4296 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4297 vect_remove_slp_scalar_calls (vinfo, child, visited);
4298
4299 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4300 {
4301 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4302 if (!stmt || gimple_bb (stmt) == NULL)
4303 continue;
4304 if (is_pattern_stmt_p (stmt_info)
4305 || !PURE_SLP_STMT (stmt_info))
4306 continue;
4307 lhs = gimple_call_lhs (stmt);
4308 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4309 gsi = gsi_for_stmt (stmt);
4310 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4311 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4312 }
4313 }
4314
/* Wrapper of the above with a fresh visited set, so shared subtrees in
   NODE's SLP graph are processed only once.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}
4321
/* Vectorize the instance root:  replace INSTANCE's root scalar stmt with
   an assignment from NODE's vectorized defs - either a single (possibly
   view-converted) vector def or a CONSTRUCTOR of all of them.  */

void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
    {
      gimple *child_stmt;
      int j;

      /* Only the first vector stmt is needed; the loop breaks after it.  */
      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	{
	  tree vect_lhs = gimple_get_lhs (child_stmt);
	  tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
	  /* Bridge a type mismatch with a VIEW_CONVERT_EXPR.  */
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	  break;
	}
    }
  else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
    {
      /* Multiple vector stmts - gather their defs into a CONSTRUCTOR
	 of the root stmt's rhs type.  */
      int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      gimple *child_stmt;
      int j;
      vec<constructor_elt, va_gc> *v;
      vec_alloc (v, nelts);

      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
	{
	  CONSTRUCTOR_APPEND_ELT (v,
				  NULL_TREE,
				  gimple_get_lhs (child_stmt));
	}
      tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
      tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
      tree r_constructor = build_constructor (rtype, v);
      rstmt = gimple_build_assign (lhs, r_constructor);
    }

  gcc_assert (rstmt);

  /* Splice the new assignment in place of the scalar root stmt.  */
  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
4371
/* Generate vector code for all SLP instances in the loop/basic block.
   First schedules every instance tree, then in a second pass removes the
   now-dead scalar stmts - the passes are separate because scalar stmts
   may be shared between instances.  */

void
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;

  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      /* Schedule the tree of INSTANCE.  */
      vect_schedule_slp_instance (vinfo, node, instance);

      /* If the instance has a root stmt (e.g. a CONSTRUCTOR) replace it
	 with the vectorized defs.  */
      if (SLP_INSTANCE_ROOT_STMT (instance))
	vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }

  /* Second pass: clean up scalar stmts superseded by the vector code.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ??? Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ??? For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the vectorized store group's scalar stmts; the iteration
	 stops at the first non-data-ref stmt.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  if (!STMT_VINFO_DATA_REF (store_info))
	    break;

	  /* Root-stmt instances keep their scalar stmts; the root was
	     already replaced above.  */
	  if (SLP_INSTANCE_ROOT_STMT (instance))
	    continue;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);
	}
    }
}