remove STMT_VINFO_NUM_SLP_USES
gcc/tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48
49
50 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
51 slp_tree, stmt_vector_for_cost *);
52
 53 /* Initialize an SLP node.  */
54
55 _slp_tree::_slp_tree ()
56 {
57 SLP_TREE_SCALAR_STMTS (this) = vNULL;
58 SLP_TREE_SCALAR_OPS (this) = vNULL;
59 SLP_TREE_VEC_STMTS (this) = vNULL;
60 SLP_TREE_VEC_DEFS (this) = vNULL;
61 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
62 SLP_TREE_CHILDREN (this) = vNULL;
63 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
64 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
65 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
66 SLP_TREE_CODE (this) = ERROR_MARK;
67 SLP_TREE_VECTYPE (this) = NULL_TREE;
68 SLP_TREE_REPRESENTATIVE (this) = NULL;
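       /* The creator of the node owns the initial reference.  */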
69 this->refcnt = 1;
70 this->max_nunits = 1;
71 this->lanes = 0;
72 }
73
 74 /* Tear down an SLP node.  */
75
76 _slp_tree::~_slp_tree ()
77 {
78 SLP_TREE_CHILDREN (this).release ();
79 SLP_TREE_SCALAR_STMTS (this).release ();
80 SLP_TREE_SCALAR_OPS (this).release ();
81 SLP_TREE_VEC_STMTS (this).release ();
82 SLP_TREE_VEC_DEFS (this).release ();
83 SLP_TREE_LOAD_PERMUTATION (this).release ();
84 SLP_TREE_LANE_PERMUTATION (this).release ();
85 }
86
87 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
88
89 static void
90 vect_free_slp_tree (slp_tree node)
91 {
92 int i;
93 slp_tree child;
94
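       /* Nodes are reference-counted; only removing the last reference
	  frees the node and recurses into the children.  */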
95 if (--node->refcnt != 0)
96 return;
97
98 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
99 vect_free_slp_tree (child);
100
101 delete node;
102 }
103
 104 /* Return a location suitable for dumps related to the SLP instance.  */
105
106 dump_user_location_t
107 _slp_instance::location () const
108 {
109 if (root_stmt)
110 return root_stmt->stmt;
111 else
112 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
113 }
114
115
116 /* Free the memory allocated for the SLP instance. */
117
118 void
119 vect_free_slp_instance (slp_instance instance)
120 {
121 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
122 SLP_INSTANCE_LOADS (instance).release ();
123 instance->subgraph_entries.release ();
124 instance->cost_vec.release ();
125 free (instance);
126 }
127
128
129 /* Create an SLP node for SCALAR_STMTS. */
130
131 static slp_tree
132 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
133 {
134 slp_tree node = new _slp_tree;
135 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
136 SLP_TREE_CHILDREN (node).create (nops);
137 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
138 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
139 SLP_TREE_LANES (node) = scalar_stmts.length ();
140 return node;
141 }
142
143 /* Create an SLP node for OPS. */
144
145 static slp_tree
146 vect_create_new_slp_node (vec<tree> ops)
147 {
148 slp_tree node = new _slp_tree;
149 SLP_TREE_SCALAR_OPS (node) = ops;
150 SLP_TREE_DEF_TYPE (node) = vect_external_def;
151 SLP_TREE_LANES (node) = ops.length ();
152 return node;
153 }
154
155
 156 /* This structure is used during the creation of an SLP tree.  Each
 157    instance corresponds to the same operand in a group of scalar stmts
 158    in an SLP node.  */
159 typedef struct _slp_oprnd_info
160 {
161 /* Def-stmts for the operands. */
162 vec<stmt_vec_info> def_stmts;
163 /* Operands. */
164 vec<tree> ops;
 165   /* Information about the first statement: its vector def-type, its type,
 166      the operand itself in case it's constant, and an indication of whether
 167      it's a pattern stmt.  */
168 tree first_op_type;
169 enum vect_def_type first_dt;
170 bool any_pattern;
171 } *slp_oprnd_info;
172
173
174 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
175 operand. */
176 static vec<slp_oprnd_info>
177 vect_create_oprnd_info (int nops, int group_size)
178 {
179 int i;
180 slp_oprnd_info oprnd_info;
181 vec<slp_oprnd_info> oprnds_info;
182
183 oprnds_info.create (nops);
184 for (i = 0; i < nops; i++)
185 {
186 oprnd_info = XNEW (struct _slp_oprnd_info);
187 oprnd_info->def_stmts.create (group_size);
188 oprnd_info->ops.create (group_size);
189 oprnd_info->first_dt = vect_uninitialized_def;
190 oprnd_info->first_op_type = NULL_TREE;
191 oprnd_info->any_pattern = false;
192 oprnds_info.quick_push (oprnd_info);
193 }
194
195 return oprnds_info;
196 }
197
198
199 /* Free operands info. */
200
201 static void
202 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
203 {
204 int i;
205 slp_oprnd_info oprnd_info;
206
207 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
208 {
209 oprnd_info->def_stmts.release ();
210 oprnd_info->ops.release ();
211 XDELETE (oprnd_info);
212 }
213
214 oprnds_info.release ();
215 }
216
217
218 /* Return true if STMTS contains a pattern statement. */
219
220 static bool
221 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
222 {
223 stmt_vec_info stmt_info;
224 unsigned int i;
225 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
226 if (is_pattern_stmt_p (stmt_info))
227 return true;
228 return false;
229 }
230
231 /* Return true when all lanes in the external or constant NODE have
232 the same value. */
233
234 static bool
235 vect_slp_tree_uniform_p (slp_tree node)
236 {
237 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
238 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
239
 240   /* Pre-existing vectors.  */
241 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
242 return false;
243
244 unsigned i;
245 tree op, first = NULL_TREE;
246 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
247 if (!first)
248 first = op;
249 else if (!operand_equal_p (first, op, 0))
250 return false;
251
252 return true;
253 }
254
255 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
256 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
257 of the chain. */
258
259 int
260 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
261 stmt_vec_info first_stmt_info)
262 {
263 stmt_vec_info next_stmt_info = first_stmt_info;
264 int result = 0;
265
266 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
267 return -1;
268
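   /* Walk the interleaving chain from its first element, accumulating
      each successor's DR_GROUP_GAP to obtain STMT_INFO's position.  */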
269 do
270 {
271 if (next_stmt_info == stmt_info)
272 return result;
273 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
274 if (next_stmt_info)
275 result += DR_GROUP_GAP (next_stmt_info);
276 }
277 while (next_stmt_info);
278
279 return -1;
280 }
281
282 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
283 using the method implemented by duplicate_and_interleave. Return true
284 if so, returning the number of intermediate vectors in *NVECTORS_OUT
285 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
286 (if nonnull). */
287
288 bool
289 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
290 tree elt_type, unsigned int *nvectors_out,
291 tree *vector_type_out,
292 tree *permutes)
293 {
294 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
295 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
296 return false;
297
298 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
299 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
300 unsigned int nvectors = 1;
301 for (;;)
302 {
303 scalar_int_mode int_mode;
304 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
305 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
306 {
307 /* Get the natural vector type for this SLP group size. */
308 tree int_type = build_nonstandard_integer_type
309 (GET_MODE_BITSIZE (int_mode), 1);
310 tree vector_type
311 = get_vectype_for_scalar_type (vinfo, int_type, count);
312 if (vector_type
313 && VECTOR_MODE_P (TYPE_MODE (vector_type))
314 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
315 GET_MODE_SIZE (base_vector_mode)))
316 {
317 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
318 together into elements of type INT_TYPE and using the result
319 to build NVECTORS vectors. */
320 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
321 vec_perm_builder sel1 (nelts, 2, 3);
322 vec_perm_builder sel2 (nelts, 2, 3);
323 poly_int64 half_nelts = exact_div (nelts, 2);
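 	      /* Build the two interleave permutations as 2-pattern, stepped
 		 encodings: sel1 selects { 0, nelts, 1, nelts + 1, ... }, the
 		 low halves of the two inputs, and sel2 the high halves.  */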
324 for (unsigned int i = 0; i < 3; ++i)
325 {
326 sel1.quick_push (i);
327 sel1.quick_push (i + nelts);
328 sel2.quick_push (half_nelts + i);
329 sel2.quick_push (half_nelts + i + nelts);
330 }
331 vec_perm_indices indices1 (sel1, 2, nelts);
332 vec_perm_indices indices2 (sel2, 2, nelts);
333 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
334 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
335 {
336 if (nvectors_out)
337 *nvectors_out = nvectors;
338 if (vector_type_out)
339 *vector_type_out = vector_type;
340 if (permutes)
341 {
342 permutes[0] = vect_gen_perm_mask_checked (vector_type,
343 indices1);
344 permutes[1] = vect_gen_perm_mask_checked (vector_type,
345 indices2);
346 }
347 return true;
348 }
349 }
350 }
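 	  /* Retry with integer elements half the size, doubling the number
 	     of intermediate vectors.  */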
351 if (!multiple_p (elt_bytes, 2, &elt_bytes))
352 return false;
353 nvectors *= 2;
354 }
355 }
356
 357 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
 358    they are of a valid type and that they match the defs of the first stmt of
 359    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
 360    by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero SWAP
 361    indicates that a swap is required for cond_expr stmts.  Specifically,
 362    SWAP is 1 if STMT is a cond_expr and the operands of its comparison
 363    need to be swapped; SWAP is 2 if STMT is a cond_expr and the
 364    comparison code needs to be inverted.
 365
 366    Return -1 if there was a fatal error; return 1 if the error could be
 367    corrected by swapping operands of this node's parent; return 0 if
 368    everything is ok.  */
369 static int
370 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
371 vec<stmt_vec_info> stmts, unsigned stmt_num,
372 vec<slp_oprnd_info> *oprnds_info)
373 {
374 stmt_vec_info stmt_info = stmts[stmt_num];
375 tree oprnd;
376 unsigned int i, number_of_oprnds;
377 enum vect_def_type dt = vect_uninitialized_def;
378 slp_oprnd_info oprnd_info;
379 int first_op_idx = 1;
380 unsigned int commutative_op = -1U;
381 bool first_op_cond = false;
382 bool first = stmt_num == 0;
383
384 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
385 {
386 number_of_oprnds = gimple_call_num_args (stmt);
387 first_op_idx = 3;
388 if (gimple_call_internal_p (stmt))
389 {
390 internal_fn ifn = gimple_call_internal_fn (stmt);
391 commutative_op = first_commutative_argument (ifn);
392
393 /* Masked load, only look at mask. */
394 if (ifn == IFN_MASK_LOAD)
395 {
396 number_of_oprnds = 1;
397 /* Mask operand index. */
398 first_op_idx = 5;
399 }
400 }
401 }
402 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
403 {
404 enum tree_code code = gimple_assign_rhs_code (stmt);
405 number_of_oprnds = gimple_num_ops (stmt) - 1;
 406       /* Swapping can only be done for cond_expr if asked to, otherwise we
 407 	 could end up with a comparison code different from the first stmt.  */
408 if (code == COND_EXPR
409 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
410 {
411 first_op_cond = true;
412 number_of_oprnds++;
413 }
414 else
415 commutative_op = commutative_tree_code (code) ? 0U : -1U;
416 }
417 else
418 return -1;
419
420 bool swapped = (swap != 0);
421 gcc_assert (!swapped || first_op_cond);
422 for (i = 0; i < number_of_oprnds; i++)
423 {
424 again:
425 if (first_op_cond)
426 {
427 /* Map indicating how operands of cond_expr should be swapped. */
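 	      /* maps[1] swaps the two comparison operands; maps[2] swaps the
 		 then/else arms, for use with an inverted comparison code.  */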
428 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
429 int *map = maps[swap];
430
431 if (i < 2)
432 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
433 first_op_idx), map[i]);
434 else
435 oprnd = gimple_op (stmt_info->stmt, map[i]);
436 }
437 else
438 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
439 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
440 oprnd = TREE_OPERAND (oprnd, 0);
441
442 oprnd_info = (*oprnds_info)[i];
443
444 stmt_vec_info def_stmt_info;
445 if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
446 {
447 if (dump_enabled_p ())
448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
449 "Build SLP failed: can't analyze def for %T\n",
450 oprnd);
451
452 return -1;
453 }
454
455 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
456 oprnd_info->any_pattern = true;
457
458 if (first)
459 {
 460 	  /* For the swapping logic below force vect_reduction_def
 461 	     for the reduction op in an SLP reduction group.  */
462 if (!STMT_VINFO_DATA_REF (stmt_info)
463 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
464 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
465 && def_stmt_info)
466 dt = vect_reduction_def;
467 oprnd_info->first_dt = dt;
468 oprnd_info->first_op_type = TREE_TYPE (oprnd);
469 }
470 else
471 {
 472 	  /* This is not the first stmt of the group; check that its
 473 	     def-stmt/s match the def-stmt/s of the first stmt.  Allow
 474 	     different definition types for reduction chains: the first
 475 	     stmt must be a vect_reduction_def (a phi node), and the rest
 476 	     must end in the reduction chain.  */
477 tree type = TREE_TYPE (oprnd);
478 if ((oprnd_info->first_dt != dt
479 && !(oprnd_info->first_dt == vect_reduction_def
480 && !STMT_VINFO_DATA_REF (stmt_info)
481 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
482 && def_stmt_info
483 && !STMT_VINFO_DATA_REF (def_stmt_info)
484 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
485 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
486 && !((oprnd_info->first_dt == vect_external_def
487 || oprnd_info->first_dt == vect_constant_def)
488 && (dt == vect_external_def
489 || dt == vect_constant_def)))
490 || !types_compatible_p (oprnd_info->first_op_type, type)
491 || (!STMT_VINFO_DATA_REF (stmt_info)
492 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
493 && ((!def_stmt_info
494 || STMT_VINFO_DATA_REF (def_stmt_info)
495 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
496 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
497 != (oprnd_info->first_dt != vect_reduction_def))))
498 {
499 /* Try swapping operands if we got a mismatch. */
500 if (i == commutative_op && !swapped)
501 {
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "trying swapped operands\n");
505 swapped = true;
506 goto again;
507 }
508
509 if (dump_enabled_p ())
510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
511 "Build SLP failed: different types\n");
512
513 return 1;
514 }
515 if ((dt == vect_constant_def
516 || dt == vect_external_def)
517 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
518 && (TREE_CODE (type) == BOOLEAN_TYPE
519 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
520 type)))
521 {
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
524 "Build SLP failed: invalid type of def "
525 "for variable-length SLP %T\n", oprnd);
526 return -1;
527 }
528 }
529
530 /* Check the types of the definitions. */
531 switch (dt)
532 {
533 case vect_external_def:
534 /* Make sure to demote the overall operand to external. */
535 oprnd_info->first_dt = vect_external_def;
536 /* Fallthru. */
537 case vect_constant_def:
538 oprnd_info->def_stmts.quick_push (NULL);
539 oprnd_info->ops.quick_push (oprnd);
540 break;
541
542 case vect_internal_def:
543 case vect_reduction_def:
544 if (oprnd_info->first_dt == vect_reduction_def
545 && !STMT_VINFO_DATA_REF (stmt_info)
546 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
547 && !STMT_VINFO_DATA_REF (def_stmt_info)
548 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
549 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
550 {
 551 	      /* For an SLP reduction chain we want to duplicate the
 552 		 reduction to each of the chain members.  That gets
 553 		 us a sane SLP graph (though the stmts are not 100%
 554 		 correct w.r.t. the initial values).  */
555 gcc_assert (!first);
556 oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
557 oprnd_info->ops.quick_push (oprnd_info->ops[0]);
558 break;
559 }
560 /* Fallthru. */
561 case vect_induction_def:
562 oprnd_info->def_stmts.quick_push (def_stmt_info);
563 oprnd_info->ops.quick_push (oprnd);
564 break;
565
566 default:
567 /* FORNOW: Not supported. */
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
570 "Build SLP failed: illegal type of def %T\n",
571 oprnd);
572
573 return -1;
574 }
575 }
576
577 /* Swap operands. */
578 if (swapped)
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "swapped operands to match def types in %G",
583 stmt_info->stmt);
584 }
585
586 return 0;
587 }
588
589 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
590 Return true if we can, meaning that this choice doesn't conflict with
591 existing SLP nodes that use STMT_INFO. */
592
593 bool
594 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
595 {
596 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
597 if (old_vectype)
598 return useless_type_conversion_p (vectype, old_vectype);
599
600 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
601 {
602 /* We maintain the invariant that if any statement in the group is
603 used, all other members of the group have the same vector type. */
604 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
605 stmt_vec_info member_info = first_info;
606 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
607 if (is_pattern_stmt_p (member_info)
608 && !useless_type_conversion_p (vectype,
609 STMT_VINFO_VECTYPE (member_info)))
610 break;
611
612 if (!member_info)
613 {
614 for (member_info = first_info; member_info;
615 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
616 STMT_VINFO_VECTYPE (member_info) = vectype;
617 return true;
618 }
619 }
620 else if (!is_pattern_stmt_p (stmt_info))
621 {
622 STMT_VINFO_VECTYPE (stmt_info) = vectype;
623 return true;
624 }
625
626 if (dump_enabled_p ())
627 {
628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
629 "Build SLP failed: incompatible vector"
630 " types for: %G", stmt_info->stmt);
631 dump_printf_loc (MSG_NOTE, vect_location,
632 " old vector type: %T\n", old_vectype);
633 dump_printf_loc (MSG_NOTE, vect_location,
634 " new vector type: %T\n", vectype);
635 }
636 return false;
637 }
638
639 /* Return true if call statements CALL1 and CALL2 are similar enough
640 to be combined into the same SLP group. */
641
642 static bool
643 compatible_calls_p (gcall *call1, gcall *call2)
644 {
645 unsigned int nargs = gimple_call_num_args (call1);
646 if (nargs != gimple_call_num_args (call2))
647 return false;
648
649 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
650 return false;
651
652 if (gimple_call_internal_p (call1))
653 {
654 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
655 TREE_TYPE (gimple_call_lhs (call2))))
656 return false;
657 for (unsigned int i = 0; i < nargs; ++i)
658 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
659 TREE_TYPE (gimple_call_arg (call2, i))))
660 return false;
661 }
662 else
663 {
664 if (!operand_equal_p (gimple_call_fn (call1),
665 gimple_call_fn (call2), 0))
666 return false;
667
668 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
669 return false;
670 }
671 return true;
672 }
673
674 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
675 caller's attempt to find the vector type in STMT_INFO with the narrowest
676 element type. Return true if VECTYPE is nonnull and if it is valid
677 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
678 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
679 vect_build_slp_tree. */
680
681 static bool
682 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
683 unsigned int group_size,
684 tree vectype, poly_uint64 *max_nunits)
685 {
686 if (!vectype)
687 {
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
690 "Build SLP failed: unsupported data-type in %G\n",
691 stmt_info->stmt);
692 /* Fatal mismatch. */
693 return false;
694 }
695
696 /* If populating the vector type requires unrolling then fail
697 before adjusting *max_nunits for basic-block vectorization. */
698 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
699 unsigned HOST_WIDE_INT const_nunits;
700 if (is_a <bb_vec_info> (vinfo)
701 && (!nunits.is_constant (&const_nunits)
702 || const_nunits > group_size))
703 {
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
706 "Build SLP failed: unrolling required "
707 "in basic block SLP\n");
708 /* Fatal mismatch. */
709 return false;
710 }
711
712 /* In case of multiple types we need to detect the smallest type. */
713 vect_update_max_nunits (max_nunits, vectype);
714 return true;
715 }
716
 717 /* Verify that the scalar stmts STMTS are isomorphic, do not require any
 718    data permutation, and are of supported operation types.  Return
 719    true if so, otherwise return false and indicate in *MATCHES
 720    which stmts are not isomorphic to the first one.  If MATCHES[0]
 721    is false then this indicates that the comparison could not be
 722    carried out or that the stmts will never be vectorized by SLP.
 723
 724    Note a COND_EXPR is possibly isomorphic to another one after swapping its
 725    operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
 726    the first stmt by swapping the two operands of its comparison; set SWAP[i]
 727    to 2 if stmt I is isomorphic to the first stmt by inverting the code
 728    of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
 729    swapped to B1 <= A1 ? X1 : Y1, or inverted to A1 < B1 ? Y1 : X1.  */
730
731 static bool
732 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
733 vec<stmt_vec_info> stmts, unsigned int group_size,
734 poly_uint64 *max_nunits, bool *matches,
735 bool *two_operators, tree *node_vectype)
736 {
737 unsigned int i;
738 stmt_vec_info first_stmt_info = stmts[0];
739 enum tree_code first_stmt_code = ERROR_MARK;
740 enum tree_code alt_stmt_code = ERROR_MARK;
741 enum tree_code rhs_code = ERROR_MARK;
742 enum tree_code first_cond_code = ERROR_MARK;
743 tree lhs;
744 bool need_same_oprnds = false;
745 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
746 optab optab;
747 int icode;
748 machine_mode optab_op2_mode;
749 machine_mode vec_mode;
750 stmt_vec_info first_load = NULL, prev_first_load = NULL;
751 bool first_stmt_load_p = false, load_p = false;
752
753 /* For every stmt in NODE find its def stmt/s. */
754 stmt_vec_info stmt_info;
755 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
756 {
757 gimple *stmt = stmt_info->stmt;
758 swap[i] = 0;
759 matches[i] = false;
760
761 if (dump_enabled_p ())
762 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
763
764 /* Fail to vectorize statements marked as unvectorizable. */
765 if (!STMT_VINFO_VECTORIZABLE (stmt_info))
766 {
767 if (dump_enabled_p ())
768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
769 "Build SLP failed: unvectorizable statement %G",
770 stmt);
771 /* Fatal mismatch. */
772 matches[0] = false;
773 return false;
774 }
775
776 lhs = gimple_get_lhs (stmt);
777 if (lhs == NULL_TREE)
778 {
779 if (dump_enabled_p ())
780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
781 "Build SLP failed: not GIMPLE_ASSIGN nor "
782 "GIMPLE_CALL %G", stmt);
783 /* Fatal mismatch. */
784 matches[0] = false;
785 return false;
786 }
787
788 tree nunits_vectype;
789 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
790 &nunits_vectype, group_size)
791 || (nunits_vectype
792 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
793 nunits_vectype, max_nunits)))
794 {
795 /* Fatal mismatch. */
796 matches[0] = false;
797 return false;
798 }
799
800 gcc_assert (vectype);
801
802 gcall *call_stmt = dyn_cast <gcall *> (stmt);
803 if (call_stmt)
804 {
805 rhs_code = CALL_EXPR;
806
807 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
808 load_p = true;
809 else if ((gimple_call_internal_p (call_stmt)
810 && (!vectorizable_internal_fn_p
811 (gimple_call_internal_fn (call_stmt))))
812 || gimple_call_tail_p (call_stmt)
813 || gimple_call_noreturn_p (call_stmt)
814 || !gimple_call_nothrow_p (call_stmt)
815 || gimple_call_chain (call_stmt))
816 {
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
819 "Build SLP failed: unsupported call type %G",
820 call_stmt);
821 /* Fatal mismatch. */
822 matches[0] = false;
823 return false;
824 }
825 }
826 else
827 {
828 rhs_code = gimple_assign_rhs_code (stmt);
829 load_p = gimple_vuse (stmt);
830 }
831
832 /* Check the operation. */
833 if (i == 0)
834 {
835 *node_vectype = vectype;
836 first_stmt_code = rhs_code;
837 first_stmt_load_p = load_p;
838
839 /* Shift arguments should be equal in all the packed stmts for a
840 vector shift with scalar shift operand. */
841 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
842 || rhs_code == LROTATE_EXPR
843 || rhs_code == RROTATE_EXPR)
844 {
845 vec_mode = TYPE_MODE (vectype);
846
847 /* First see if we have a vector/vector shift. */
848 optab = optab_for_tree_code (rhs_code, vectype,
849 optab_vector);
850
851 if (!optab
852 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
853 {
854 /* No vector/vector shift, try for a vector/scalar shift. */
855 optab = optab_for_tree_code (rhs_code, vectype,
856 optab_scalar);
857
858 if (!optab)
859 {
860 if (dump_enabled_p ())
861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
862 "Build SLP failed: no optab.\n");
863 /* Fatal mismatch. */
864 matches[0] = false;
865 return false;
866 }
867 icode = (int) optab_handler (optab, vec_mode);
868 if (icode == CODE_FOR_nothing)
869 {
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
872 "Build SLP failed: "
873 "op not supported by target.\n");
874 /* Fatal mismatch. */
875 matches[0] = false;
876 return false;
877 }
878 optab_op2_mode = insn_data[icode].operand[2].mode;
879 if (!VECTOR_MODE_P (optab_op2_mode))
880 {
881 need_same_oprnds = true;
882 first_op1 = gimple_assign_rhs2 (stmt);
883 }
884 }
885 }
886 else if (rhs_code == WIDEN_LSHIFT_EXPR)
887 {
888 need_same_oprnds = true;
889 first_op1 = gimple_assign_rhs2 (stmt);
890 }
891 else if (!load_p
892 && rhs_code == BIT_FIELD_REF)
893 {
894 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
895 if (TREE_CODE (vec) != SSA_NAME
896 || !types_compatible_p (vectype, TREE_TYPE (vec)))
897 {
898 if (dump_enabled_p ())
899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
900 "Build SLP failed: "
901 "BIT_FIELD_REF not supported\n");
902 /* Fatal mismatch. */
903 matches[0] = false;
904 return false;
905 }
906 }
907 else if (call_stmt
908 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
909 {
910 need_same_oprnds = true;
911 first_op1 = gimple_call_arg (call_stmt, 1);
912 }
913 }
914 else
915 {
916 if (first_stmt_code != rhs_code
917 && alt_stmt_code == ERROR_MARK)
918 alt_stmt_code = rhs_code;
919 if ((first_stmt_code != rhs_code
920 && (first_stmt_code != IMAGPART_EXPR
921 || rhs_code != REALPART_EXPR)
922 && (first_stmt_code != REALPART_EXPR
923 || rhs_code != IMAGPART_EXPR)
924 /* Handle mismatches in plus/minus by computing both
925 and merging the results. */
926 && !((first_stmt_code == PLUS_EXPR
927 || first_stmt_code == MINUS_EXPR)
928 && (alt_stmt_code == PLUS_EXPR
929 || alt_stmt_code == MINUS_EXPR)
930 && rhs_code == alt_stmt_code)
931 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
932 && (first_stmt_code == ARRAY_REF
933 || first_stmt_code == BIT_FIELD_REF
934 || first_stmt_code == INDIRECT_REF
935 || first_stmt_code == COMPONENT_REF
936 || first_stmt_code == MEM_REF)))
937 || first_stmt_load_p != load_p)
938 {
939 if (dump_enabled_p ())
940 {
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "Build SLP failed: different operation "
943 "in stmt %G", stmt);
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "original stmt %G", first_stmt_info->stmt);
946 }
947 /* Mismatch. */
948 continue;
949 }
950
951 if (need_same_oprnds)
952 {
953 tree other_op1 = (call_stmt
954 ? gimple_call_arg (call_stmt, 1)
955 : gimple_assign_rhs2 (stmt));
956 if (!operand_equal_p (first_op1, other_op1, 0))
957 {
958 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
960 "Build SLP failed: different shift "
961 "arguments in %G", stmt);
962 /* Mismatch. */
963 continue;
964 }
965 }
966 if (!load_p
967 && first_stmt_code == BIT_FIELD_REF
968 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
969 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
970 {
971 if (dump_enabled_p ())
972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
973 "Build SLP failed: different BIT_FIELD_REF "
974 "arguments in %G", stmt);
975 /* Mismatch. */
976 continue;
977 }
978
979 if (!load_p && rhs_code == CALL_EXPR)
980 {
981 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
982 as_a <gcall *> (stmt)))
983 {
984 if (dump_enabled_p ())
985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
986 "Build SLP failed: different calls in %G",
987 stmt);
988 /* Mismatch. */
989 continue;
990 }
991 }
992
993 if (!types_compatible_p (vectype, *node_vectype))
994 {
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
997 "Build SLP failed: different vector type "
998 "in %G", stmt);
999 /* Mismatch. */
1000 continue;
1001 }
1002 }
1003
1004 /* Grouped store or load. */
1005 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1006 {
1007 if (REFERENCE_CLASS_P (lhs))
1008 {
1009 /* Store. */
1010 ;
1011 }
1012 else
1013 {
1014 /* Load. */
1015 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1016 if (prev_first_load)
1017 {
1018 /* Check that there are no loads from different interleaving
1019 chains in the same node. */
1020 if (prev_first_load != first_load)
1021 {
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1024 vect_location,
1025 "Build SLP failed: different "
1026 "interleaving chains in one node %G",
1027 stmt);
1028 /* Mismatch. */
1029 continue;
1030 }
1031 }
1032 else
1033 prev_first_load = first_load;
1034 }
1035 } /* Grouped access. */
1036 else
1037 {
1038 if (load_p)
1039 {
1040 /* Not grouped load. */
1041 if (dump_enabled_p ())
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043 "Build SLP failed: not grouped load %G", stmt);
1044
1045 /* FORNOW: Not grouped loads are not supported. */
1046 /* Fatal mismatch. */
1047 matches[0] = false;
1048 return false;
1049 }
1050
1051 /* Not memory operation. */
1052 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
1053 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1054 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1055 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1056 && rhs_code != VIEW_CONVERT_EXPR
1057 && rhs_code != CALL_EXPR
1058 && rhs_code != BIT_FIELD_REF)
1059 {
1060 if (dump_enabled_p ())
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1062 "Build SLP failed: operation unsupported %G",
1063 stmt);
1064 /* Fatal mismatch. */
1065 matches[0] = false;
1066 return false;
1067 }
1068
1069 if (rhs_code == COND_EXPR)
1070 {
1071 tree cond_expr = gimple_assign_rhs1 (stmt);
1072 enum tree_code cond_code = TREE_CODE (cond_expr);
1073 enum tree_code swap_code = ERROR_MARK;
1074 enum tree_code invert_code = ERROR_MARK;
1075
1076 if (i == 0)
1077 first_cond_code = TREE_CODE (cond_expr);
1078 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1079 {
1080 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1081 swap_code = swap_tree_comparison (cond_code);
1082 invert_code = invert_tree_comparison (cond_code, honor_nans);
1083 }
1084
1085 if (first_cond_code == cond_code)
1086 ;
1087 /* Isomorphic can be achieved by swapping. */
1088 else if (first_cond_code == swap_code)
1089 swap[i] = 1;
1090 /* Isomorphic can be achieved by inverting. */
1091 else if (first_cond_code == invert_code)
1092 swap[i] = 2;
1093 else
1094 {
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097 "Build SLP failed: different"
1098 " operation %G", stmt);
1099 /* Mismatch. */
1100 continue;
1101 }
1102 }
1103 }
1104
1105 matches[i] = true;
1106 }
1107
1108 for (i = 0; i < group_size; ++i)
1109 if (!matches[i])
1110 return false;
1111
 1112   /* If we allowed a two-operation SLP node, record that; whether the
 1113      target can cope with the blend permute is verified later.  */
1114 if (alt_stmt_code != ERROR_MARK
1115 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1116 {
1117 *two_operators = true;
1118 }
1119
1120 return true;
1121 }
1122
 1123 /* Traits for the hash_map used to record SLP builds for a stmt set.
 1124    Note we never remove entries except at destruction time so we do
 1125    not need a special value for deleted that differs from empty.  */
1126 struct bst_traits
1127 {
1128 typedef vec <stmt_vec_info> value_type;
1129 typedef vec <stmt_vec_info> compare_type;
1130 static inline hashval_t hash (value_type);
1131 static inline bool equal (value_type existing, value_type candidate);
1132 static inline bool is_empty (value_type x) { return !x.exists (); }
1133 static inline bool is_deleted (value_type x) { return !x.exists (); }
1134 static const bool empty_zero_p = true;
1135 static inline void mark_empty (value_type &x) { x.release (); }
1136 static inline void mark_deleted (value_type &x) { x.release (); }
1137 static inline void remove (value_type &x) { x.release (); }
1138 };
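/* Hash a stmt set by combining the gimple UIDs of its members.  */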
1139 inline hashval_t
1140 bst_traits::hash (value_type x)
1141 {
1142 inchash::hash h;
1143 for (unsigned i = 0; i < x.length (); ++i)
1144 h.add_int (gimple_uid (x[i]->stmt));
1145 return h.end ();
1146 }
1147 inline bool
1148 bst_traits::equal (value_type existing, value_type candidate)
1149 {
1150 if (existing.length () != candidate.length ())
1151 return false;
1152 for (unsigned i = 0; i < existing.length (); ++i)
1153 if (existing[i] != candidate[i])
1154 return false;
1155 return true;
1156 }
1157
1158 typedef hash_map <vec <gimple *>, slp_tree,
1159 simple_hashmap_traits <bst_traits, slp_tree> >
1160 scalar_stmts_to_slp_tree_map_t;
1161
1162 static slp_tree
1163 vect_build_slp_tree_2 (vec_info *vinfo,
1164 vec<stmt_vec_info> stmts, unsigned int group_size,
1165 poly_uint64 *max_nunits,
1166 bool *matches, unsigned *npermutes, unsigned *tree_size,
1167 scalar_stmts_to_slp_tree_map_t *bst_map);
1168
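/* Build an SLP tree for the stmt set STMTS, memoizing both successful and
   failed builds in BST_MAP so that repeated requests for the same stmt set
   re-use (or re-fail) the earlier result instead of rebuilding it.  */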
1169 static slp_tree
1170 vect_build_slp_tree (vec_info *vinfo,
1171 vec<stmt_vec_info> stmts, unsigned int group_size,
1172 poly_uint64 *max_nunits,
1173 bool *matches, unsigned *npermutes, unsigned *tree_size,
1174 scalar_stmts_to_slp_tree_map_t *bst_map)
1175 {
1176 if (slp_tree *leader = bst_map->get (stmts))
1177 {
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1180 *leader ? "" : "failed ", *leader);
1181 if (*leader)
1182 {
1183 (*leader)->refcnt++;
1184 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1185 }
1186 return *leader;
1187 }
1188 poly_uint64 this_max_nunits = 1;
1189 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
1190 &this_max_nunits,
1191 matches, npermutes, tree_size, bst_map);
1192 if (res)
1193 {
1194 res->max_nunits = this_max_nunits;
1195 vect_update_max_nunits (max_nunits, this_max_nunits);
1196 /* Keep a reference for the bst_map use. */
1197 res->refcnt++;
1198 }
1199 bst_map->put (stmts.copy (), res);
1200 return res;
1201 }
1202
 1203 /* Recursively build an SLP tree for the stmt group STMTS.
 1204    Fail (and return NULL) if the def-stmts are not isomorphic, require
 1205    data permutation, or are of unsupported operation types.  On
 1206    failure, *MATCHES indicates which stmts did not match the first
 1207    one, with MATCHES[0] == false signalling a fatal, uncorrectable
 1208    mismatch.  */
1209
1210 static slp_tree
1211 vect_build_slp_tree_2 (vec_info *vinfo,
1212 vec<stmt_vec_info> stmts, unsigned int group_size,
1213 poly_uint64 *max_nunits,
1214 bool *matches, unsigned *npermutes, unsigned *tree_size,
1215 scalar_stmts_to_slp_tree_map_t *bst_map)
1216 {
1217 unsigned nops, i, this_tree_size = 0;
1218 poly_uint64 this_max_nunits = *max_nunits;
1219 slp_tree node;
1220
1221 matches[0] = false;
1222
1223 stmt_vec_info stmt_info = stmts[0];
1224 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1225 nops = gimple_call_num_args (stmt);
1226 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1227 {
1228 nops = gimple_num_ops (stmt) - 1;
1229 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1230 nops++;
1231 }
1232 else if (is_a <gphi *> (stmt_info->stmt))
1233 nops = 0;
1234 else
1235 return NULL;
1236
1237 /* If the SLP node is a PHI (induction or reduction), terminate
1238 the recursion. */
1239 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1240 {
1241 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1242 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
1243 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1244 max_nunits))
1245 return NULL;
1246
1247 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1248 /* Induction from different IVs is not supported. */
1249 if (def_type == vect_induction_def)
1250 {
1251 stmt_vec_info other_info;
1252 FOR_EACH_VEC_ELT (stmts, i, other_info)
1253 if (stmt_info != other_info)
1254 return NULL;
1255 }
1256 else if (def_type == vect_reduction_def
1257 || def_type == vect_double_reduction_def
1258 || def_type == vect_nested_cycle)
1259 {
1260 /* Else def types have to match. */
1261 stmt_vec_info other_info;
1262 FOR_EACH_VEC_ELT (stmts, i, other_info)
1263 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1264 return NULL;
1265 }
1266 else
1267 return NULL;
1268 (*tree_size)++;
1269 node = vect_create_new_slp_node (stmts, 0);
1270 SLP_TREE_VECTYPE (node) = vectype;
1271 return node;
1272 }
1273
1274
1275 bool two_operators = false;
1276 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1277 tree vectype = NULL_TREE;
1278 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1279 &this_max_nunits, matches, &two_operators,
1280 &vectype))
1281 return NULL;
1282
1283 /* If the SLP node is a load, terminate the recursion unless masked. */
1284 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1285 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1286 {
1287 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1288 {
1289 /* Masked load. */
1290 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1291 nops = 1;
1292 }
1293 else
1294 {
1295 *max_nunits = this_max_nunits;
1296 (*tree_size)++;
1297 node = vect_create_new_slp_node (stmts, 0);
1298 SLP_TREE_VECTYPE (node) = vectype;
1299 /* And compute the load permutation. Whether it is actually
1300 a permutation depends on the unrolling factor which is
1301 decided later. */
1302 vec<unsigned> load_permutation;
1303 int j;
1304 stmt_vec_info load_info;
1305 load_permutation.create (group_size);
1306 stmt_vec_info first_stmt_info
1307 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1308 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1309 {
1310 int load_place = vect_get_place_in_interleaving_chain
1311 (load_info, first_stmt_info);
1312 gcc_assert (load_place != -1);
1313 load_permutation.safe_push (load_place);
1314 }
1315 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1316 return node;
1317 }
1318 }
1319 else if (gimple_assign_single_p (stmt_info->stmt)
1320 && !gimple_vuse (stmt_info->stmt)
1321 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1322 {
 1323       /* vect_build_slp_tree_1 verified that all BIT_FIELD_REFs reference
 1324 	 the same SSA name vector of a type compatible with vectype.  */
1325 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1326 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1327 stmt_vec_info estmt_info;
1328 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1329 {
1330 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1331 tree bfref = gimple_assign_rhs1 (estmt);
1332 HOST_WIDE_INT lane;
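 	  /* Each BIT_FIELD_REF must extract exactly one, naturally aligned,
 	     vector element; the lane is its offset in element-size units.  */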
1333 if (!known_eq (bit_field_size (bfref),
1334 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1335 || !constant_multiple_p (bit_field_offset (bfref),
1336 bit_field_size (bfref), &lane))
1337 {
1338 lperm.release ();
1339 return NULL;
1340 }
1341 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1342 }
1343 slp_tree vnode = vect_create_new_slp_node (vNULL);
1344 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
1345 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1346 /* We are always building a permutation node even if it is an identity
1347 permute to shield the rest of the vectorizer from the odd node
1348 representing an actual vector without any scalar ops.
 1349 	 ??? We could hide it completely by making the permute node
1350 external? */
1351 node = vect_create_new_slp_node (stmts, 1);
1352 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1353 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1354 SLP_TREE_VECTYPE (node) = vectype;
1355 SLP_TREE_CHILDREN (node).quick_push (vnode);
1356 return node;
1357 }
1358
1359 /* Get at the operands, verifying they are compatible. */
1360 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1361 slp_oprnd_info oprnd_info;
1362 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1363 {
1364 int res = vect_get_and_check_slp_defs (vinfo, swap[i],
1365 stmts, i, &oprnds_info);
1366 if (res != 0)
1367 matches[(res == -1) ? 0 : i] = false;
1368 if (!matches[0])
1369 break;
1370 }
1371 for (i = 0; i < group_size; ++i)
1372 if (!matches[i])
1373 {
1374 vect_free_oprnd_info (oprnds_info);
1375 return NULL;
1376 }
1377 swap = NULL;
1378
1379 auto_vec<slp_tree, 4> children;
1380
1381 stmt_info = stmts[0];
1382
1383 /* Create SLP_TREE nodes for the definition node/s. */
1384 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1385 {
1386 slp_tree child;
1387 unsigned int j;
1388
1389 if (oprnd_info->first_dt == vect_uninitialized_def)
1390 {
 1391 	  /* A COND_EXPR has one operand slot too many here when the
 1392 	     condition is an SSA name rather than a comparison.  */
1393 gcc_assert (i == 3 && nops == 4);
1394 continue;
1395 }
1396
1397 if (oprnd_info->first_dt != vect_internal_def
1398 && oprnd_info->first_dt != vect_reduction_def
1399 && oprnd_info->first_dt != vect_induction_def)
1400 {
1401 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1402 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1403 oprnd_info->ops = vNULL;
1404 children.safe_push (invnode);
1405 continue;
1406 }
1407
1408 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1409 group_size, &this_max_nunits,
1410 matches, npermutes,
1411 &this_tree_size, bst_map)) != NULL)
1412 {
1413 oprnd_info->def_stmts = vNULL;
1414 children.safe_push (child);
1415 continue;
1416 }
1417
 1418       /* If the SLP build for operand zero failed and operand zero
 1419 	 and one can be commuted, try that for the scalar stmts
 1420 	 that failed the match.  */
1421 if (i == 0
1422 /* A first scalar stmt mismatch signals a fatal mismatch. */
1423 && matches[0]
1424 /* ??? For COND_EXPRs we can swap the comparison operands
1425 as well as the arms under some constraints. */
1426 && nops == 2
1427 && oprnds_info[1]->first_dt == vect_internal_def
1428 && is_gimple_assign (stmt_info->stmt)
1429 /* Swapping operands for reductions breaks assumptions later on. */
1430 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1431 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
 1432 	 /* Do so only if the number of unsuccessful permutes was not more
 1433 	    than a cut-off, as re-trying the recursive match on
 1434 	    possibly each level of the tree would expose exponential
 1435 	    behavior.  */
1436 && *npermutes < 4)
1437 {
1438 /* See whether we can swap the matching or the non-matching
1439 stmt operands. */
1440 bool swap_not_matching = true;
1441 do
1442 {
1443 for (j = 0; j < group_size; ++j)
1444 {
1445 if (matches[j] != !swap_not_matching)
1446 continue;
1447 stmt_vec_info stmt_info = stmts[j];
1448 /* Verify if we can swap operands of this stmt. */
1449 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1450 if (!stmt
1451 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1452 {
1453 if (!swap_not_matching)
1454 goto fail;
1455 swap_not_matching = false;
1456 break;
1457 }
1458 }
1459 }
1460 while (j != group_size);
1461
1462 /* Swap mismatched definition stmts. */
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_NOTE, vect_location,
1465 "Re-trying with swapped operands of stmts ");
1466 for (j = 0; j < group_size; ++j)
1467 if (matches[j] == !swap_not_matching)
1468 {
1469 std::swap (oprnds_info[0]->def_stmts[j],
1470 oprnds_info[1]->def_stmts[j]);
1471 std::swap (oprnds_info[0]->ops[j],
1472 oprnds_info[1]->ops[j]);
1473 if (dump_enabled_p ())
1474 dump_printf (MSG_NOTE, "%d ", j);
1475 }
1476 if (dump_enabled_p ())
1477 dump_printf (MSG_NOTE, "\n");
1478 /* And try again with scratch 'matches' ... */
1479 bool *tem = XALLOCAVEC (bool, group_size);
1480 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1481 group_size, &this_max_nunits,
1482 tem, npermutes,
1483 &this_tree_size, bst_map)) != NULL)
1484 {
1485 oprnd_info->def_stmts = vNULL;
1486 children.safe_push (child);
1487 continue;
1488 }
1489 /* We do not undo the swapping here since it might still be
1490 the better order for the second operand in case we build
1491 the first one from scalars below. */
1492 ++*npermutes;
1493 }
1494 fail:
1495
 1496       /* If the SLP build failed and we are analyzing a basic block,
 1497 	 simply treat nodes we failed to build as externally defined
 1498 	 (and thus build vectors from the scalar defs).
 1499 	 The cost model will reject outright expensive cases.
 1500 	 ??? This doesn't handle cases where permutation ultimately
 1501 	 fails (or where we don't try permutation below).  Ideally we'd
 1502 	 even compute a permutation that will end up with the maximum
 1503 	 SLP tree size...  */
1504 if (is_a <bb_vec_info> (vinfo)
1505 /* ??? Rejecting patterns this way doesn't work. We'd have to
1506 do extra work to cancel the pattern so the uses see the
1507 scalar version. */
1508 && !is_pattern_stmt_p (stmt_info)
1509 && !oprnd_info->any_pattern)
1510 {
 1511 	  /* But if there's a leading vector-sized set of matching stmts,
 1512 	     fail here so we can split the group.  This matches the condition
 1513 	     vect_analyze_slp_instance uses.  */
1514 /* ??? We might want to split here and combine the results to support
1515 multiple vector sizes better. */
1516 for (j = 0; j < group_size; ++j)
1517 if (!matches[j])
1518 break;
1519 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1520 {
1521 if (dump_enabled_p ())
1522 dump_printf_loc (MSG_NOTE, vect_location,
1523 "Building vector operands from scalars\n");
1524 this_tree_size++;
1525 child = vect_create_new_slp_node (oprnd_info->ops);
1526 children.safe_push (child);
1527 oprnd_info->ops = vNULL;
1528 oprnd_info->def_stmts = vNULL;
1529 continue;
1530 }
1531 }
1532
1533 gcc_assert (child == NULL);
1534 FOR_EACH_VEC_ELT (children, j, child)
1535 vect_free_slp_tree (child);
1536 vect_free_oprnd_info (oprnds_info);
1537 return NULL;
1538 }
1539
1540 vect_free_oprnd_info (oprnds_info);
1541
 1542   /* If all children of this node are built up from uniform scalars,
 1543      just throw the node away and cause it to be built from scalars
 1544      instead.  The exception is the SLP node for the vector store.  */
1545 if (is_a <bb_vec_info> (vinfo)
1546 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1547 /* ??? Rejecting patterns this way doesn't work. We'd have to
1548 do extra work to cancel the pattern so the uses see the
1549 scalar version. */
1550 && !is_pattern_stmt_p (stmt_info))
1551 {
1552 slp_tree child;
1553 unsigned j;
1554 FOR_EACH_VEC_ELT (children, j, child)
1555 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
1556 || !vect_slp_tree_uniform_p (child))
1557 break;
1558 if (!child)
1559 {
1560 /* Roll back. */
1561 matches[0] = false;
1562 FOR_EACH_VEC_ELT (children, j, child)
1563 vect_free_slp_tree (child);
1564
1565 if (dump_enabled_p ())
1566 dump_printf_loc (MSG_NOTE, vect_location,
1567 "Building parent vector operands from "
1568 "scalars instead\n");
1569 return NULL;
1570 }
1571 }
1572
1573 *tree_size += this_tree_size + 1;
1574 *max_nunits = this_max_nunits;
1575
1576 if (two_operators)
1577 {
 1578       /* ??? We'd likely want to either cache in bst_map something like
 1579 	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
 1580 	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
 1581 	 explicit stmts to put in so the keying on 'stmts' doesn't
 1582 	 work (but we have the same issue with nodes that use 'ops').  */
1583 slp_tree one = new _slp_tree;
1584 slp_tree two = new _slp_tree;
1585 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1586 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1587 SLP_TREE_VECTYPE (one) = vectype;
1588 SLP_TREE_VECTYPE (two) = vectype;
1589 SLP_TREE_CHILDREN (one).safe_splice (children);
1590 SLP_TREE_CHILDREN (two).safe_splice (children);
1591 slp_tree child;
1592 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1593 child->refcnt++;
1594
1595 /* Here we record the original defs since this
1596 node represents the final lane configuration. */
1597 node = vect_create_new_slp_node (stmts, 2);
1598 SLP_TREE_VECTYPE (node) = vectype;
1599 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1600 SLP_TREE_CHILDREN (node).quick_push (one);
1601 SLP_TREE_CHILDREN (node).quick_push (two);
1602 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1603 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1604 enum tree_code ocode = ERROR_MARK;
1605 stmt_vec_info ostmt_info;
1606 unsigned j = 0;
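       /* For each lane record whether it comes from the CODE0 node (operand
 	 0) or the OCODE node (operand 1), keeping the lanes in order.  */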
1607 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1608 {
1609 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1610 if (gimple_assign_rhs_code (ostmt) != code0)
1611 {
1612 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1613 ocode = gimple_assign_rhs_code (ostmt);
1614 j = i;
1615 }
1616 else
1617 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1618 }
1619 SLP_TREE_CODE (one) = code0;
1620 SLP_TREE_CODE (two) = ocode;
1621 SLP_TREE_LANES (one) = stmts.length ();
1622 SLP_TREE_LANES (two) = stmts.length ();
1623 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1624 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1625 return node;
1626 }
1627
1628 node = vect_create_new_slp_node (stmts, nops);
1629 SLP_TREE_VECTYPE (node) = vectype;
1630 SLP_TREE_CHILDREN (node).splice (children);
1631 return node;
1632 }
1633
1634 /* Dump a single SLP tree NODE. */
1635
1636 static void
1637 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1638 slp_tree node)
1639 {
1640 unsigned i, j;
1641 slp_tree child;
1642 stmt_vec_info stmt_info;
1643 tree op;
1644
1645 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1646 dump_user_location_t user_loc = loc.get_user_location ();
1647 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1648 SLP_TREE_DEF_TYPE (node) == vect_external_def
1649 ? " (external)"
1650 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1651 ? " (constant)"
1652 : ""), node,
1653 estimated_poly_value (node->max_nunits), node->refcnt);
1654 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1655 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1656 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1657 else
1658 {
1659 dump_printf_loc (metadata, user_loc, "\t{ ");
1660 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1661 dump_printf (metadata, "%T%s ", op,
1662 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1663 dump_printf (metadata, "}\n");
1664 }
1665 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1666 {
1667 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1668 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1669 dump_printf (dump_kind, " %u", j);
1670 dump_printf (dump_kind, " }\n");
1671 }
1672 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1673 {
1674 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
1675 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
1676 dump_printf (dump_kind, " %u[%u]",
1677 SLP_TREE_LANE_PERMUTATION (node)[i].first,
1678 SLP_TREE_LANE_PERMUTATION (node)[i].second);
1679 dump_printf (dump_kind, " }\n");
1680 }
1681 if (SLP_TREE_CHILDREN (node).is_empty ())
1682 return;
1683 dump_printf_loc (metadata, user_loc, "\tchildren");
1684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1685 dump_printf (dump_kind, " %p", (void *)child);
1686 dump_printf (dump_kind, "\n");
1687 }
1688
1689 DEBUG_FUNCTION void
1690 debug (slp_tree node)
1691 {
1692 debug_dump_context ctx;
1693 vect_print_slp_tree (MSG_NOTE,
1694 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1695 node);
1696 }
1697
 1698 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND.  */
1699
1700 static void
1701 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1702 slp_tree node, hash_set<slp_tree> &visited)
1703 {
1704 unsigned i;
1705 slp_tree child;
1706
1707 if (visited.add (node))
1708 return;
1709
1710 vect_print_slp_tree (dump_kind, loc, node);
1711
1712 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1713 vect_print_slp_graph (dump_kind, loc, child, visited);
1714 }
1715
1716 static void
1717 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1718 slp_tree entry)
1719 {
1720 hash_set<slp_tree> visited;
1721 vect_print_slp_graph (dump_kind, loc, entry, visited);
1722 }
1723
1724 /* Mark the tree rooted at NODE with PURE_SLP. */
1725
1726 static void
1727 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1728 {
1729 int i;
1730 stmt_vec_info stmt_info;
1731 slp_tree child;
1732
1733 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1734 return;
1735
1736 if (visited.add (node))
1737 return;
1738
1739 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1740 STMT_SLP_TYPE (stmt_info) = pure_slp;
1741
1742 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1743 vect_mark_slp_stmts (child, visited);
1744 }
1745
1746 static void
1747 vect_mark_slp_stmts (slp_tree node)
1748 {
1749 hash_set<slp_tree> visited;
1750 vect_mark_slp_stmts (node, visited);
1751 }
1752
1753 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1754
1755 static void
1756 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1757 {
1758 int i;
1759 stmt_vec_info stmt_info;
1760 slp_tree child;
1761
1762 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1763 return;
1764
1765 if (visited.add (node))
1766 return;
1767
1768 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1769 {
1770 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1771 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1772 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1773 }
1774
1775 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1776 vect_mark_slp_stmts_relevant (child, visited);
1777 }
1778
1779 static void
1780 vect_mark_slp_stmts_relevant (slp_tree node)
1781 {
1782 hash_set<slp_tree> visited;
1783 vect_mark_slp_stmts_relevant (node, visited);
1784 }
1785
1786 /* Copy the SLP subtree rooted at NODE. */
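/* The MAP argument ensures shared subtrees are copied only once, so the
   copy preserves the DAG shape of the original.  */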
1787
1788 static slp_tree
1789 slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
1790 {
1791 unsigned i;
1792
1793 bool existed_p;
1794 slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
1795 if (existed_p)
1796 return copy_ref;
1797
1798 copy_ref = new _slp_tree;
1799 slp_tree copy = copy_ref;
1800 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
1801 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
1802 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
1803 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
1804 copy->max_nunits = node->max_nunits;
1805 copy->refcnt = 0;
1806 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1807 SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
1808 if (SLP_TREE_SCALAR_OPS (node).exists ())
1809 SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
1810 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1811 SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
1812 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1813 SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
1814 if (SLP_TREE_CHILDREN (node).exists ())
1815 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
1816 gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
1817
1818 slp_tree child;
1819 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
1820 {
1821 SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
1822 SLP_TREE_CHILDREN (copy)[i]->refcnt++;
1823 }
1824 return copy;
1825 }
1826
1827 /* Rearrange the statements of NODE according to PERMUTATION. */
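/* As an illustration of the mapping implemented below, PERMUTATION = {2, 0, 1}
moves the stmt in lane 0 to lane 2, lane 1 to lane 0 and lane 2 to lane 1,
i.e. new[permutation[i]] = old[i] for each lane i. */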
1828
1829 static void
1830 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1831 vec<unsigned> permutation,
1832 hash_set<slp_tree> &visited)
1833 {
1834 unsigned int i;
1835 slp_tree child;
1836
1837 if (visited.add (node))
1838 return;
1839
1840 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1841 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1842
1843 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1844 {
1845 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1846 vec<stmt_vec_info> tmp_stmts;
1847 tmp_stmts.create (group_size);
1848 tmp_stmts.quick_grow (group_size);
1849 stmt_vec_info stmt_info;
1850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1851 tmp_stmts[permutation[i]] = stmt_info;
1852 SLP_TREE_SCALAR_STMTS (node).release ();
1853 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1854 }
1855 if (SLP_TREE_SCALAR_OPS (node).exists ())
1856 {
1857 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1858 vec<tree> tmp_ops;
1859 tmp_ops.create (group_size);
1860 tmp_ops.quick_grow (group_size);
1861 tree op;
1862 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1863 tmp_ops[permutation[i]] = op;
1864 SLP_TREE_SCALAR_OPS (node).release ();
1865 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1866 }
1867 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1868 {
1869 gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
1870 for (i = 0; i < group_size; ++i)
1871 SLP_TREE_LANE_PERMUTATION (node)[i].second
1872 = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
1873 }
1874 }
1875
1876
1877 /* Attempt to reorder stmts in a reduction chain so that we don't
1878 require any load permutation. Return true if that was possible,
1879 otherwise return false. */
1880
1881 static bool
1882 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1883 {
1884 unsigned int i, j;
1885 unsigned int lidx;
1886 slp_tree node, load;
1887
1888 if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
1889 return false;
1890
1891 /* Compare all the permutation sequences to the first one. We know
1892 that at least one load is permuted. */
1893 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1894 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1895 return false;
1896 unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
1897 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1898 {
1899 if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
1900 || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
1901 return false;
1902 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
1903 if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
1904 return false;
1905 }
1906
1907 /* Check that the loads in the first sequence are different, that there
1908 are no gaps between them, and that there is an actual permutation. */
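/* For a group of size four, for example, the load permutation {2, 0, 3, 1}
passes the checks below (every index in range, no duplicates, not the
identity), while {0, 0, 2, 3} is rejected by the duplicate check. */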
1909 bool any_permute = false;
1910 auto_sbitmap load_index (group_size);
1911 bitmap_clear (load_index);
1912 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1913 {
1914 if (lidx != i)
1915 any_permute = true;
1916 if (lidx >= group_size)
1917 return false;
1918 if (bitmap_bit_p (load_index, lidx))
1919 return false;
1920
1921 bitmap_set_bit (load_index, lidx);
1922 }
1923 if (!any_permute)
1924 return false;
1925 for (i = 0; i < group_size; i++)
1926 if (!bitmap_bit_p (load_index, i))
1927 return false;
1928
1929 /* This permutation is valid for reduction. Since the order of the
1930 statements in the nodes is not important unless they are memory
1931 accesses, we can rearrange the statements in all the nodes
1932 according to the order of the loads. */
1933
1934 /* We have to unshare the SLP tree we modify. */
1935 hash_map<slp_tree, slp_tree> map;
1936 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1937 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn));
1938 unshared->refcnt++;
1939 SLP_INSTANCE_TREE (slp_instn) = unshared;
1940 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1941 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1942 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1943
1944 /* Do the actual re-arrangement. */
1945 hash_set<slp_tree> visited;
1946 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1947 node->load_permutation, visited);
1948
1949 /* We are done; no actual permutations need to be generated. */
1950 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1951 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1952 {
1953 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1954 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1955 /* But we have to keep those permutations that are required because
1956 of handling of gaps. */
1957 if (known_eq (unrolling_factor, 1U)
1958 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1959 && DR_GROUP_GAP (first_stmt_info) == 0))
1960 SLP_TREE_LOAD_PERMUTATION (node).release ();
1961 else
1962 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1963 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1964 }
1965
1966 return true;
1967 }
1968
1969 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
1970
1971 static void
1972 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1973 hash_set<slp_tree> &visited)
1974 {
1975 if (visited.add (node))
1976 return;
1977
1978 if (SLP_TREE_CHILDREN (node).length () == 0)
1979 {
1980 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1981 return;
1982 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1983 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1984 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1985 loads.safe_push (node);
1986 }
1987 else
1988 {
1989 unsigned i;
1990 slp_tree child;
1991 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1992 vect_gather_slp_loads (loads, child, visited);
1993 }
1994 }
1995
1996 static void
1997 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1998 {
1999 hash_set<slp_tree> visited;
2000 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
2001 }
2002
2003
2004 /* Find the last scalar stmt in NODE. */
2005
2006 stmt_vec_info
2007 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2008 {
2009 stmt_vec_info last = NULL;
2010 stmt_vec_info stmt_vinfo;
2011
2012 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2013 {
2014 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2015 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2016 }
2017
2018 return last;
2019 }
2020
2021 /* Find the first stmt in NODE. */
2022
2023 stmt_vec_info
2024 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2025 {
2026 stmt_vec_info first = NULL;
2027 stmt_vec_info stmt_vinfo;
2028
2029 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2030 {
2031 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2032 if (!first
2033 || get_later_stmt (stmt_vinfo, first) == first)
2034 first = stmt_vinfo;
2035 }
2036
2037 return first;
2038 }
2039
2040 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2041 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2042 (also containing the first GROUP1_SIZE stmts, since stores are
2043 consecutive), the second containing the remainder.
2044 Return the first stmt in the second group. */
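/* As a worked example, splitting a store group of size 8 with original
DR_GROUP_GAP G at GROUP1_SIZE = 3 yields a first group of size 3 with
gap G + 5 and a second group of size 5 with gap G + 3, so both groups
still cover the footprint of the original group. */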
2045
2046 static stmt_vec_info
2047 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2048 {
2049 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2050 gcc_assert (group1_size > 0);
2051 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2052 gcc_assert (group2_size > 0);
2053 DR_GROUP_SIZE (first_vinfo) = group1_size;
2054
2055 stmt_vec_info stmt_info = first_vinfo;
2056 for (unsigned i = group1_size; i > 1; i--)
2057 {
2058 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2059 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2060 }
2061 /* STMT_INFO is now the last element of the first group. */
2062 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2063 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2064
2065 DR_GROUP_SIZE (group2) = group2_size;
2066 for (stmt_info = group2; stmt_info;
2067 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2068 {
2069 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2070 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2071 }
2072
2073 /* For the second group, the DR_GROUP_GAP is that before the original group,
2074 plus skipping over the first group. */
2075 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2076
2077 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2078 DR_GROUP_GAP (first_vinfo) += group2_size;
2079
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2082 group1_size, group2_size);
2083
2084 return group2;
2085 }
2086
2087 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2088 statements and a vector of NUNITS elements. */
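/* For example, with NUNITS = 4 and GROUP_SIZE = 6 the common multiple is
12, giving an unrolling factor of 12 / 6 = 2: two copies of the group
are needed to fill whole vectors. */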
2089
2090 static poly_uint64
2091 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2092 {
2093 return exact_div (common_multiple (nunits, group_size), group_size);
2094 }
2095
2096 /* Analyze an SLP instance starting from a group of grouped stores. Call
2097 vect_build_slp_tree to build a tree of packed stmts if possible.
2098 Return FALSE if it's impossible to SLP any stmt in the loop. */
2099
2100 static bool
2101 vect_analyze_slp_instance (vec_info *vinfo,
2102 scalar_stmts_to_slp_tree_map_t *bst_map,
2103 stmt_vec_info stmt_info, unsigned max_tree_size)
2104 {
2105 slp_instance new_instance;
2106 slp_tree node;
2107 unsigned int group_size;
2108 tree vectype, scalar_type = NULL_TREE;
2109 unsigned int i;
2110 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2111 vec<stmt_vec_info> scalar_stmts;
2112 bool constructor = false;
2113
2114 if (is_a <bb_vec_info> (vinfo))
2115 vect_location = stmt_info->stmt;
2116 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117 {
2118 scalar_type = TREE_TYPE (DR_REF (dr));
2119 group_size = DR_GROUP_SIZE (stmt_info);
2120 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2121 }
2122 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2123 {
2124 gcc_assert (is_a <loop_vec_info> (vinfo));
2125 vectype = STMT_VINFO_VECTYPE (stmt_info);
2126 group_size = REDUC_GROUP_SIZE (stmt_info);
2127 }
2128 else if (is_gimple_assign (stmt_info->stmt)
2129 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2130 {
2131 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2132 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2133 constructor = true;
2134 }
2135 else
2136 {
2137 gcc_assert (is_a <loop_vec_info> (vinfo));
2138 vectype = STMT_VINFO_VECTYPE (stmt_info);
2139 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2140 }
2141
2142 if (!vectype)
2143 {
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146 "Build SLP failed: unsupported data-type %T\n",
2147 scalar_type);
2148
2149 return false;
2150 }
2151 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2152
2153 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2154 scalar_stmts.create (group_size);
2155 stmt_vec_info next_info = stmt_info;
2156 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2157 {
2158 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2159 while (next_info)
2160 {
2161 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2162 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2163 }
2164 }
2165 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2166 {
2167 /* Collect the reduction stmts and store them in
2168 SLP_TREE_SCALAR_STMTS. */
2169 while (next_info)
2170 {
2171 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2172 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2173 }
2174 /* Mark the first element of the reduction chain as reduction to properly
2175 transform the node. In the reduction analysis phase only the last
2176 element of the chain is marked as reduction. */
2177 STMT_VINFO_DEF_TYPE (stmt_info)
2178 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2179 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2180 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2181 }
2182 else if (constructor)
2183 {
2184 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2185 tree val;
2186 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2187 {
2188 if (TREE_CODE (val) == SSA_NAME)
2189 {
2190 gimple* def = SSA_NAME_DEF_STMT (val);
2191 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2192 /* Value is defined in another basic block. */
2193 if (!def_info)
2194 return false;
2195 def_info = vect_stmt_to_vectorize (def_info);
2196 scalar_stmts.safe_push (def_info);
2197 }
2198 else
2199 return false;
2200 }
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_NOTE, vect_location,
2203 "Analyzing vectorizable constructor: %G\n",
2204 stmt_info->stmt);
2205 }
2206 else
2207 {
2208 /* Collect reduction statements. */
2209 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2210 for (i = 0; reductions.iterate (i, &next_info); i++)
2211 scalar_stmts.safe_push (next_info);
2212 }
2213
2214 if (dump_enabled_p ())
2215 {
2216 dump_printf_loc (MSG_NOTE, vect_location,
2217 "Starting SLP discovery for\n");
2218 for (i = 0; i < scalar_stmts.length (); ++i)
2219 dump_printf_loc (MSG_NOTE, vect_location,
2220 " %G", scalar_stmts[i]->stmt);
2221 }
2222
2223 /* Build the tree for the SLP instance. */
2224 bool *matches = XALLOCAVEC (bool, group_size);
2225 unsigned npermutes = 0;
2226 poly_uint64 max_nunits = nunits;
2227 unsigned tree_size = 0;
2228 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2229 &max_nunits, matches, &npermutes,
2230 &tree_size, bst_map);
2231 if (node != NULL)
2232 {
2233 /* Calculate the unrolling factor based on the smallest type. */
2234 poly_uint64 unrolling_factor
2235 = calculate_unrolling_factor (max_nunits, group_size);
2236
2237 if (maybe_ne (unrolling_factor, 1U)
2238 && is_a <bb_vec_info> (vinfo))
2239 {
2240 unsigned HOST_WIDE_INT const_max_nunits;
2241 if (!max_nunits.is_constant (&const_max_nunits)
2242 || const_max_nunits > group_size)
2243 {
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "Build SLP failed: store group "
2247 "size not a multiple of the vector size "
2248 "in basic block SLP\n");
2249 vect_free_slp_tree (node);
2250 return false;
2251 }
2252 /* Fatal mismatch. */
2253 if (dump_enabled_p ())
2254 dump_printf_loc (MSG_NOTE, vect_location,
2255 "SLP discovery succeeded but node needs "
2256 "splitting\n");
2257 matches[0] = true;
2258 matches[group_size / const_max_nunits * const_max_nunits] = false;
2259 vect_free_slp_tree (node);
2260 }
2261 else
2262 {
2263 /* Create a new SLP instance. */
2264 new_instance = XNEW (class _slp_instance);
2265 SLP_INSTANCE_TREE (new_instance) = node;
2266 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2267 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2268 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2269 new_instance->reduc_phis = NULL;
2270 new_instance->cost_vec = vNULL;
2271 new_instance->subgraph_entries = vNULL;
2272
2273 vect_gather_slp_loads (new_instance, node);
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE, vect_location,
2276 "SLP size %u vs. limit %u.\n",
2277 tree_size, max_tree_size);
2278
2279 /* Check whether any load is possibly permuted. */
2280 slp_tree load_node;
2281 bool loads_permuted = false;
2282 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2283 {
2284 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2285 continue;
2286 unsigned j;
2287 stmt_vec_info load_info;
2288 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2289 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2290 {
2291 loads_permuted = true;
2292 break;
2293 }
2294 }
2295
2296 /* If the loads and stores can be handled with load/store-lane
2297 instructions, do not generate this SLP instance. */
2298 if (is_a <loop_vec_info> (vinfo)
2299 && loads_permuted
2300 && dr && vect_store_lanes_supported (vectype, group_size, false))
2301 {
2302 slp_tree load_node;
2303 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2304 {
2305 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2306 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2307 /* Use SLP for strided accesses (or if we can't load-lanes). */
2308 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2309 || ! vect_load_lanes_supported
2310 (STMT_VINFO_VECTYPE (stmt_vinfo),
2311 DR_GROUP_SIZE (stmt_vinfo), false))
2312 break;
2313 }
2314 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2315 {
2316 if (dump_enabled_p ())
2317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2318 "Built SLP cancelled: can use "
2319 "load/store-lanes\n");
2320 vect_free_slp_instance (new_instance);
2321 return false;
2322 }
2323 }
2324
2325 /* If this is a reduction chain with a conversion in front,
2326 amend the SLP tree with a node for that. */
2327 if (!dr
2328 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2329 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2330 {
2331 /* Get at the conversion stmt - we know it's the single use
2332 of the last stmt of the reduction chain. */
2333 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2334 use_operand_p use_p;
2335 gimple *use_stmt;
2336 bool r = single_imm_use (gimple_assign_lhs (tem),
2337 &use_p, &use_stmt);
2338 gcc_assert (r);
2339 next_info = vinfo->lookup_stmt (use_stmt);
2340 next_info = vect_stmt_to_vectorize (next_info);
2341 scalar_stmts = vNULL;
2342 scalar_stmts.create (group_size);
2343 for (unsigned i = 0; i < group_size; ++i)
2344 scalar_stmts.quick_push (next_info);
2345 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2346 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2347 SLP_TREE_CHILDREN (conv).quick_push (node);
2348 SLP_INSTANCE_TREE (new_instance) = conv;
2349 /* We also have to fake this conversion stmt as SLP reduction
2350 group so we don't have to mess with too much code
2351 elsewhere. */
2352 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2353 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2354 }
2355
2356 vinfo->slp_instances.safe_push (new_instance);
2357
2358 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2359 the number of scalar stmts in the root in a few places.
2360 Verify that assumption holds. */
2361 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2362 .length () == group_size);
2363
2364 if (dump_enabled_p ())
2365 {
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "Final SLP tree for instance:\n");
2368 vect_print_slp_graph (MSG_NOTE, vect_location,
2369 SLP_INSTANCE_TREE (new_instance));
2370 }
2371
2372 return true;
2373 }
2374 }
2375 else
2376 {
2377 /* Failed to SLP. */
2378 /* Free the allocated memory. */
2379 scalar_stmts.release ();
2380 }
2381
2382 /* For basic block SLP, try to break the group up into multiples of the
2383 vector size. */
2384 unsigned HOST_WIDE_INT const_nunits;
2385 if (is_a <bb_vec_info> (vinfo)
2386 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2387 && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))
2388 && nunits.is_constant (&const_nunits))
2389 {
2390 /* We consider breaking the group only on VF boundaries from the existing
2391 start. */
2392 for (i = 0; i < group_size; i++)
2393 if (!matches[i]) break;
2394
2395 if (i >= const_nunits && i < group_size)
2396 {
2397 /* Split into two groups at the first vector boundary before i. */
2398 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2399 unsigned group1_size = i & ~(const_nunits - 1);
2400
2401 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_NOTE, vect_location,
2403 "Splitting SLP group at stmt %u\n", i);
2404 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2405 group1_size);
2406 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2407 max_tree_size);
2408 /* If the first non-match was in the middle of a vector,
2409 skip the rest of that vector. */
2410 if (group1_size < i)
2411 {
2412 i = group1_size + const_nunits;
2413 if (i < group_size)
2414 rest = vect_split_slp_store_group (rest, const_nunits);
2415 }
2416 if (i < group_size)
2417 res |= vect_analyze_slp_instance (vinfo, bst_map,
2418 rest, max_tree_size);
2419 return res;
2420 }
2421 /* Even though the first vector did not all match, we might be able to SLP
2422 (some) of the remainder. FORNOW ignore this possibility. */
2423 }
2424
2425 /* Failed to SLP. */
2426 if (dump_enabled_p ())
2427 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2428 return false;
2429 }
2430
2431
2432 /* Check if there are stmts in the loop that can be vectorized using SLP.
2433 Build SLP trees of packed scalar stmts if SLP is possible. */
2434
2435 opt_result
2436 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2437 {
2438 unsigned int i;
2439 stmt_vec_info first_element;
2440
2441 DUMP_VECT_SCOPE ("vect_analyze_slp");
2442
2443 scalar_stmts_to_slp_tree_map_t *bst_map
2444 = new scalar_stmts_to_slp_tree_map_t ();
2445
2446 /* Find SLP sequences starting from groups of grouped stores. */
2447 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2448 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2449
2450 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2451 {
2452 if (loop_vinfo->reduction_chains.length () > 0)
2453 {
2454 /* Find SLP sequences starting from reduction chains. */
2455 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2456 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2457 max_tree_size))
2458 {
2459 /* Dissolve reduction chain group. */
2460 stmt_vec_info vinfo = first_element;
2461 stmt_vec_info last = NULL;
2462 while (vinfo)
2463 {
2464 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2465 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2466 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2467 last = vinfo;
2468 vinfo = next;
2469 }
2470 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2471 /* It can still be vectorized as part of an SLP reduction. */
2472 loop_vinfo->reductions.safe_push (last);
2473 }
2474 }
2475
2476 /* Find SLP sequences starting from groups of reductions. */
2477 if (loop_vinfo->reductions.length () > 1)
2478 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2479 max_tree_size);
2480 }
2481
2482 /* The map keeps a reference on SLP nodes built; release that. */
2483 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2484 it != bst_map->end (); ++it)
2485 if ((*it).second)
2486 vect_free_slp_tree ((*it).second);
2487 delete bst_map;
2488
2489 /* Optimize permutations in SLP reductions. */
2490 slp_instance instance;
2491 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2492 {
2493 slp_tree node = SLP_INSTANCE_TREE (instance);
2494 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2495 /* Reduction (there are no data-refs in the root).
2496 In a reduction chain the order of the loads is not important. */
2497 if (!STMT_VINFO_DATA_REF (stmt_info)
2498 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2499 vect_attempt_slp_rearrange_stmts (instance);
2500 }
2501
2502 /* Gather all loads in the SLP graph. */
2503 hash_set<slp_tree> visited;
2504 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2505 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2506 visited);
2507
2508 return opt_result::success ();
2509 }
2510
2511 void
2512 vect_optimize_slp (vec_info *vinfo)
2513 {
2514 slp_tree node;
2515 unsigned i;
2516 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2517 {
2518 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2519 continue;
2520
2521 /* In basic block vectorization we allow any subchain of an interleaving
2522 chain.
2523 FORNOW: not in loop SLP because of realignment complications. */
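/* For instance, loads of elements {2, 3, 4} of an interleaving chain form
a subchain: each element is the DR_GROUP successor of the previous one
with gap 1, so the load permutation can simply be dropped. */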
2524 if (is_a <bb_vec_info> (vinfo))
2525 {
2526 bool subchain_p = true;
2527 stmt_vec_info next_load_info = NULL;
2528 stmt_vec_info load_info;
2529 unsigned j;
2530 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2531 {
2532 if (j != 0
2533 && (next_load_info != load_info
2534 || DR_GROUP_GAP (load_info) != 1))
2535 {
2536 subchain_p = false;
2537 break;
2538 }
2539 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2540 }
2541 if (subchain_p)
2542 {
2543 SLP_TREE_LOAD_PERMUTATION (node).release ();
2544 continue;
2545 }
2546 }
2547 else
2548 {
2549 stmt_vec_info load_info;
2550 bool this_load_permuted = false;
2551 unsigned j;
2552 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2553 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2554 {
2555 this_load_permuted = true;
2556 break;
2557 }
2558 stmt_vec_info first_stmt_info
2559 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2560 if (!this_load_permuted
2561 /* The load requires permutation when unrolling exposes
2562 a gap either because the group is larger than the SLP
2563 group-size or because there is a gap between the groups. */
2564 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2565 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2566 && DR_GROUP_GAP (first_stmt_info) == 0)))
2567 {
2568 SLP_TREE_LOAD_PERMUTATION (node).release ();
2569 continue;
2570 }
2571 }
2572 }
2573 }
2574
2575
2576 /* For each possible SLP instance decide whether to SLP it and calculate the
2577 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2578 to SLP at least one instance. */
2579
2580 bool
2581 vect_make_slp_decision (loop_vec_info loop_vinfo)
2582 {
2583 unsigned int i;
2584 poly_uint64 unrolling_factor = 1;
2585 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2586 slp_instance instance;
2587 int decided_to_slp = 0;
2588
2589 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2590
2591 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2592 {
2593 /* FORNOW: SLP if you can. */
2594 /* All unroll factors have the form:
2595
2596 GET_MODE_SIZE (vinfo->vector_mode) * X
2597
2598 for some rational X, so they must have a common multiple. */
2599 unrolling_factor
2600 = force_common_multiple (unrolling_factor,
2601 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2602
2603 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2604 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2605 loop-based vectorization. Such stmts will be marked as HYBRID. */
2606 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2607 decided_to_slp++;
2608 }
2609
2610 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2611
2612 if (decided_to_slp && dump_enabled_p ())
2613 {
2614 dump_printf_loc (MSG_NOTE, vect_location,
2615 "Decided to SLP %d instances. Unrolling factor ",
2616 decided_to_slp);
2617 dump_dec (MSG_NOTE, unrolling_factor);
2618 dump_printf (MSG_NOTE, "\n");
2619 }
2620
2621 return (decided_to_slp > 0);
2622 }
2623
2624 /* Private data for vect_detect_hybrid_slp. */
2625 struct vdhs_data
2626 {
2627 loop_vec_info loop_vinfo;
2628 vec<stmt_vec_info> *worklist;
2629 };
2630
2631 /* Walker for walk_gimple_op. */
2632
2633 static tree
2634 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2635 {
2636 walk_stmt_info *wi = (walk_stmt_info *)data;
2637 vdhs_data *dat = (vdhs_data *)wi->info;
2638
2639 if (wi->is_lhs)
2640 return NULL_TREE;
2641
2642 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2643 if (!def_stmt_info)
2644 return NULL_TREE;
2645 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2646 if (PURE_SLP_STMT (def_stmt_info))
2647 {
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2650 def_stmt_info->stmt);
2651 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2652 dat->worklist->safe_push (def_stmt_info);
2653 }
2654
2655 return NULL_TREE;
2656 }
2657
2658 /* Find stmts that must be both vectorized and SLPed. */
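/* For example, when the def of a pure-SLP stmt also feeds a loop-vectorized
(non-SLP) stmt, the def must be available in both vector forms; the walker
above marks such defs hybrid and keeps following their use->def chains via
the worklist. */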
2659
2660 void
2661 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2662 {
2663 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2664
2665 /* All stmts participating in SLP are marked pure_slp; all other
2666 stmts are loop_vect.
2667 First collect all loop_vect stmts into a worklist. */
2668 auto_vec<stmt_vec_info> worklist;
2669 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2670 {
2671 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2672 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2673 gsi_next (&gsi))
2674 {
2675 gphi *phi = gsi.phi ();
2676 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2677 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2678 worklist.safe_push (stmt_info);
2679 }
2680 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2681 gsi_next (&gsi))
2682 {
2683 gimple *stmt = gsi_stmt (gsi);
2684 if (is_gimple_debug (stmt))
2685 continue;
2686 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2687 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2688 {
2689 for (gimple_stmt_iterator gsi2
2690 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2691 !gsi_end_p (gsi2); gsi_next (&gsi2))
2692 {
2693 stmt_vec_info patt_info
2694 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2695 if (!STMT_SLP_TYPE (patt_info)
2696 && STMT_VINFO_RELEVANT (patt_info))
2697 worklist.safe_push (patt_info);
2698 }
2699 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2700 }
2701 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2702 worklist.safe_push (stmt_info);
2703 }
2704 }
2705
2706 /* Now we have a worklist of non-SLP stmts; follow use->def chains and
2707 mark any SLP vectorized stmt as hybrid.
2708 ??? We're visiting def stmts N times (once for each non-SLP and
2709 once for each hybrid-SLP use). */
2710 walk_stmt_info wi;
2711 vdhs_data dat;
2712 dat.worklist = &worklist;
2713 dat.loop_vinfo = loop_vinfo;
2714 memset (&wi, 0, sizeof (wi));
2715 wi.info = (void *)&dat;
2716 while (!worklist.is_empty ())
2717 {
2718 stmt_vec_info stmt_info = worklist.pop ();
2719 /* Since SSA operands are not set up for pattern stmts we need
2720 to use walk_gimple_op. */
2721 wi.is_lhs = 0;
2722 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2723 }
2724 }
2725
2726
2727 /* Initialize a bb_vec_info struct for the statements between
2728 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
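/* The stmt uid doubles as a region marker here: the constructor sets it
to zero for stmts inside the region and the destructor below resets it
to -1. */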
2729
2730 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2731 gimple_stmt_iterator region_end_in,
2732 vec_info_shared *shared)
2733 : vec_info (vec_info::bb, init_cost (NULL), shared),
2734 bb (gsi_bb (region_begin_in)),
2735 region_begin (region_begin_in),
2736 region_end (region_end_in)
2737 {
2738 for (gimple *stmt : this->region_stmts ())
2739 {
2740 gimple_set_uid (stmt, 0);
2741 if (is_gimple_debug (stmt))
2742 continue;
2743 add_stmt (stmt);
2744 }
2745
2746 bb->aux = this;
2747 }
2748
2749
2750 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2751 stmts in the basic block. */
2752
2753 _bb_vec_info::~_bb_vec_info ()
2754 {
2755 for (gimple *stmt : this->region_stmts ())
2756 /* Reset region marker. */
2757 gimple_set_uid (stmt, -1);
2758
2759 bb->aux = NULL;
2760 }
2761
2762 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2763 given that child nodes have already been processed, and that
2764 their def types currently match their SLP node's def type. */
2765
2766 static bool
2767 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2768 slp_instance node_instance,
2769 stmt_vector_for_cost *cost_vec)
2770 {
2771 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2772 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2773
2774 /* Calculate the number of vector statements to be created for the
2775 scalar stmts in this node. For SLP reductions it is equal to the
2776 number of vector statements in the children (which has already been
2777 calculated by the recursive call). Otherwise it is the number of
2778 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2779 VF divided by the number of elements in a vector. */
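/* As a concrete example, with VF = 4, a group size of 2 and V4SI vectors
the computation below yields vect_get_num_vectors (4 * 2, V4SI) = 2
vector stmts for the node. */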
2780 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2781 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2782 {
2783 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2784 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2785 {
2786 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2787 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2788 break;
2789 }
2790 }
2791 else
2792 {
2793 poly_uint64 vf;
2794 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2795 vf = loop_vinfo->vectorization_factor;
2796 else
2797 vf = 1;
2798 unsigned int group_size = SLP_TREE_LANES (node);
2799 tree vectype = SLP_TREE_VECTYPE (node);
2800 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2801 = vect_get_num_vectors (vf * group_size, vectype);
2802 }
2803
2804 /* Handle purely internal nodes. */
2805 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2806 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
2807
2808 if (is_a <bb_vec_info> (vinfo)
2809 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
2810 return false;
2811
2812 bool dummy;
2813 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2814 node, node_instance, cost_vec);
2815 }
2816
2817 /* Try to build NODE from scalars, returning true on success.
2818 NODE_INSTANCE is the SLP instance that contains NODE. */
2819
2820 static bool
2821 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2822 slp_instance node_instance)
2823 {
2824 stmt_vec_info stmt_info;
2825 unsigned int i;
2826
2827 if (!is_a <bb_vec_info> (vinfo)
2828 || node == SLP_INSTANCE_TREE (node_instance)
2829 || !SLP_TREE_SCALAR_STMTS (node).exists ()
2830 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2831 return false;
2832
2833 if (dump_enabled_p ())
2834 dump_printf_loc (MSG_NOTE, vect_location,
2835 "Building vector operands from scalars instead\n");
2836
2837 /* Don't remove and free the child nodes here, since they could be
2838 referenced by other structures. The analysis and scheduling phases
2839 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2840 unsigned int group_size = SLP_TREE_LANES (node);
2841 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2842 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
2843 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2844 {
2845 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2846 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2847 }
2848 return true;
2849 }
2850
2851 /* Compute the prologue cost for invariant or constant operands represented
2852 by NODE. */
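/* A sketch of the costing logic below: for, say, a group of 3 scalars and
V4SI vectors each vector mixes different group members, so every vector
is costed separately; for a group of 2 and V4SI each vector repeats the
whole group and a single construction cost suffices. */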
2853
2854 static void
2855 vect_prologue_cost_for_slp (slp_tree node,
2856 stmt_vector_for_cost *cost_vec)
2857 {
2858 /* There's a special case of an existing vector, which costs nothing. */
2859 if (SLP_TREE_SCALAR_OPS (node).length () == 0
2860 && !SLP_TREE_VEC_DEFS (node).is_empty ())
2861 return;
2862 /* Without looking at the actual initializer a vector of
2863 constants can be implemented as a load from the constant pool.
2864 When all elements are the same we can use a splat. */
2865 tree vectype = SLP_TREE_VECTYPE (node);
2866 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2867 unsigned num_vects_to_check;
2868 unsigned HOST_WIDE_INT const_nunits;
2869 unsigned nelt_limit;
2870 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2871 && ! multiple_p (const_nunits, group_size))
2872 {
2873 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2874 nelt_limit = const_nunits;
2875 }
2876 else
2877 {
2878 /* If either the vector has variable length or the vectors
2879 are composed of repeated whole groups we only need to
2880 cost construction once. All vectors will be the same. */
2881 num_vects_to_check = 1;
2882 nelt_limit = group_size;
2883 }
2884 tree elt = NULL_TREE;
2885 unsigned nelt = 0;
2886 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2887 {
2888 unsigned si = j % group_size;
2889 if (nelt == 0)
2890 elt = SLP_TREE_SCALAR_OPS (node)[si];
2891 /* ??? We're just tracking whether all operands of a single
2892 vector initializer are the same; ideally we'd check if
2893 we emitted the same one already. */
2894 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2895 elt = NULL_TREE;
2896 nelt++;
2897 if (nelt == nelt_limit)
2898 {
2899 record_stmt_cost (cost_vec, 1,
2900 SLP_TREE_DEF_TYPE (node) == vect_external_def
2901 ? (elt ? scalar_to_vec : vec_construct)
2902 : vector_load,
2903 NULL, vectype, 0, vect_prologue);
2904 nelt = 0;
2905 }
2906 }
2907 }
2908
2909 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2910 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2911
2912 Return true if the operations are supported. */
2913
2914 static bool
2915 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2916 slp_instance node_instance,
2917 hash_set<slp_tree> &visited,
2918 hash_set<slp_tree> &lvisited,
2919 stmt_vector_for_cost *cost_vec)
2920 {
2921 int i, j;
2922 slp_tree child;
2923
2924 /* Assume we can code-generate all invariants. */
2925 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2926 return true;
2927
2928 /* If we already analyzed the exact same set of scalar stmts we're done;
2929 we share the generated vector stmts for those.
2930 The SLP graph is acyclic, so not caching whether we failed or succeeded
2931 doesn't cause any issue since we throw away the lvisited set
2932 when we fail. */
2933 if (visited.contains (node)
2934 || lvisited.contains (node))
2935 return true;
2936
2937 bool res = true;
2938 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2939 {
2940 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
2941 visited, lvisited, cost_vec);
2942 if (!res)
2943 break;
2944 }
2945
2946 if (res)
2947 {
2948 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2949 cost_vec);
2950 if (res)
2951 lvisited.add (node);
2952 }
2953
2954 /* When the node can be vectorized, cost the invariant nodes it references.
2955 This is not done in DFS order to allow the referring node's
2956 vectorizable_* calls to nail down the invariant node's vector type
2957 and possibly unshare it if it needs a different vector type than
2958 other referrers. */
2959 if (res)
2960 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2961 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2962 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2963 /* Perform the usual caching; note that code-generation still
2964 code-gens these nodes multiple times but we expect
2965 to CSE them later. */
2966 && !visited.contains (child)
2967 && !lvisited.add (child))
2968 {
2969 /* ??? After auditing more code paths make a "default"
2970 and push the vector type from NODE to all children
2971 if it is not already set. */
2972 /* Compute the number of vectors to be generated. */
2973 tree vector_type = SLP_TREE_VECTYPE (child);
2974 if (!vector_type)
2975 {
2976 /* For shifts with a scalar argument we don't need
2977 to cost or code-generate anything.
2978 ??? Represent this more explicitly. */
2979 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
2980 == shift_vec_info_type)
2981 && j == 1);
2982 continue;
2983 }
2984 unsigned group_size = SLP_TREE_LANES (child);
2985 poly_uint64 vf = 1;
2986 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2987 vf = loop_vinfo->vectorization_factor;
2988 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2989 = vect_get_num_vectors (vf * group_size, vector_type);
2990 /* And cost them. */
2991 vect_prologue_cost_for_slp (child, cost_vec);
2992 }
2993
2994 /* If this node or any of its children can't be vectorized, try pruning
2995 the tree here rather than felling the whole thing. */
2996 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2997 {
2998 /* We'll need to revisit this for invariant costing and for setting
2999 the number of vectorized stmts. */
3000 res = true;
3001 }
3002
3003 return res;
3004 }
3005
3006
3007 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3008 region and that can be vectorized using vectorizable_live_operation
3009 with STMT_VINFO_LIVE_P. Live operations that are not handled cause
3010 the scalar code computing them to be retained. */
3011
3012 static void
3013 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3014 slp_instance instance,
3015 stmt_vector_for_cost *cost_vec,
3016 hash_set<stmt_vec_info> &svisited)
3017 {
3018 unsigned i;
3019 stmt_vec_info stmt_info;
3020 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3021 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3022 {
3023 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3024 if (svisited.contains (orig_stmt_info))
3025 continue;
3026 bool mark_visited = true;
3027 gimple *orig_stmt = orig_stmt_info->stmt;
3028 ssa_op_iter op_iter;
3029 def_operand_p def_p;
3030 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3031 {
3032 imm_use_iterator use_iter;
3033 gimple *use_stmt;
3034 stmt_vec_info use_stmt_info;
3035 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3036 if (!is_gimple_debug (use_stmt))
3037 {
3038 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3039 if (!use_stmt_info
3040 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3041 {
3042 STMT_VINFO_LIVE_P (stmt_info) = true;
3043 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3044 NULL, node, instance, i,
3045 false, cost_vec))
3046 /* ??? So we know we can vectorize the live stmt
3047 from one SLP node. If we cannot do so consistently
3048 from all nodes or from none, we'd have to record which
3049 SLP node (and lane) we want to use for the live
3050 operation. So make sure we can code-generate
3051 from all nodes. */
3052 mark_visited = false;
3053 else
3054 STMT_VINFO_LIVE_P (stmt_info) = false;
3055 BREAK_FROM_IMM_USE_STMT (use_iter);
3056 }
3057 }
3058 /* We have to verify whether we can insert the lane extract
3059 before all uses. The following is a conservative approximation.
3060 We cannot put this into vectorizable_live_operation because
3061 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3062 doesn't work.
3063 Note that while the fact that we emit code for loads at the
3064 first load should make this a non-problem, leafs we construct
3065 from scalars are vectorized after the last scalar def.
3066 ??? If we'd actually compute the insert location during
3067 analysis we could use something less conservative than the last
3068 scalar stmt in the node for the dominance check. */
3069 /* ??? What remains is "live" uses in vector CTORs in the same
3070 SLP graph which is where those uses can end up code-generated
3071 right after their definition instead of close to their original
3072 use. But that would restrict us to code-generate lane-extracts
3073 from the latest stmt in a node. So we compensate for this
3074 during code-generation, simply not replacing uses for those
3075 hopefully rare cases. */
3076 if (STMT_VINFO_LIVE_P (stmt_info))
3077 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3078 if (!is_gimple_debug (use_stmt)
3079 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
3080 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3081 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
3082 {
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3085 "Cannot determine insertion place for "
3086 "lane extract\n");
3087 STMT_VINFO_LIVE_P (stmt_info) = false;
3088 mark_visited = true;
3089 }
3090 }
3091 if (mark_visited)
3092 svisited.add (orig_stmt_info);
3093 }
3094
3095 slp_tree child;
3096 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3097 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3098 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
3099 cost_vec, svisited);
3100 }
3101
3102 /* Analyze statements in SLP instances of VINFO. Return true if the
3103 operations are supported. */
3104
3105 bool
3106 vect_slp_analyze_operations (vec_info *vinfo)
3107 {
3108 slp_instance instance;
3109 int i;
3110
3111 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3112
3113 hash_set<slp_tree> visited;
3114 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
3115 {
3116 hash_set<slp_tree> lvisited;
3117 stmt_vector_for_cost cost_vec;
3118 cost_vec.create (2);
3119 if (is_a <bb_vec_info> (vinfo))
3120 vect_location = instance->location ();
3121 if (!vect_slp_analyze_node_operations (vinfo,
3122 SLP_INSTANCE_TREE (instance),
3123 instance, visited, lvisited,
3124 &cost_vec)
3125 /* Instances with a root stmt require vectorized defs for the
3126 SLP tree root. */
3127 || (SLP_INSTANCE_ROOT_STMT (instance)
3128 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
3129 != vect_internal_def)))
3130 {
3131 slp_tree node = SLP_INSTANCE_TREE (instance);
3132 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3133 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_NOTE, vect_location,
3135 "removing SLP instance operations starting from: %G",
3136 stmt_info->stmt);
3137 vect_free_slp_instance (instance);
3138 vinfo->slp_instances.ordered_remove (i);
3139 cost_vec.release ();
3140 }
3141 else
3142 {
3143 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3144 x != lvisited.end(); ++x)
3145 visited.add (*x);
3146 i++;
3147
3148 /* For BB vectorization remember the SLP graph entry
3149 cost for later. */
3150 if (is_a <bb_vec_info> (vinfo))
3151 instance->cost_vec = cost_vec;
3152 else
3153 {
3154 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3155 cost_vec.release ();
3156 }
3157 }
3158 }
3159
3160 /* Compute vectorizable live stmts. */
3161 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3162 {
3163 hash_set<stmt_vec_info> svisited;
3164 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
3165 {
3166 vect_location = instance->location ();
3167 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
3168 instance, &instance->cost_vec, svisited);
3169 }
3170 }
3171
3172 return !vinfo->slp_instances.is_empty ();
3173 }
3174
3175 /* Get the SLP instance leader from INSTANCE_LEADER, transitively
3176 compressing any leader chain along the way. */
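/* For example, if INSTANCE_LEADER maps A -> B, B -> C and C -> C, looking
up A returns C and rewrites the entries of A and B to point directly at
C, like path compression in a union-find structure. */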
3177
3178 static slp_instance
3179 get_ultimate_leader (slp_instance instance,
3180 hash_map<slp_instance, slp_instance> &instance_leader)
3181 {
3182 auto_vec<slp_instance *, 8> chain;
3183 slp_instance *tem;
3184 while (*(tem = instance_leader.get (instance)) != instance)
3185 {
3186 chain.safe_push (tem);
3187 instance = *tem;
3188 }
3189 while (!chain.is_empty ())
3190 *chain.pop () = instance;
3191 return instance;
3192 }
3193
3194 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3195
3196 static void
3197 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
3198 slp_instance instance, slp_tree node,
3199 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
3200 hash_map<slp_instance, slp_instance> &instance_leader)
3201 {
3202 stmt_vec_info stmt_info;
3203 unsigned i;
3204 bool all = true;
3205 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3206 {
3207 bool existed_p;
3208 slp_instance &stmt_instance
3209 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
3210 if (!existed_p)
3211 all = false;
3212 else if (stmt_instance != instance)
3213 {
3214 /* If we're running into a previously marked stmt make us the
3215 leader of the current ultimate leader. This keeps the
3216 leader chain acyclic and works even when the current instance
3217 connects two previously independent graph parts. */
3218 slp_instance stmt_leader
3219 = get_ultimate_leader (stmt_instance, instance_leader);
3220 if (stmt_leader != instance)
3221 instance_leader.put (stmt_leader, instance);
3222 }
3223 stmt_instance = instance;
3224 }
3225 /* If not all stmts had been visited we have to recurse on children. */
3226 if (all)
3227 return;
3228
3229 slp_tree child;
3230 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3231 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3232 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
3233 instance_leader);
3234 }
3235
3236 /* Partition the SLP graph into pieces that can be costed independently. */
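/* Instances that share a scalar stmt end up with a common ultimate leader
and therefore in the same subgraph; the leader's subgraph_entries vector
collects all instances of that subgraph. */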
3237
3238 static void
3239 vect_bb_partition_graph (bb_vec_info bb_vinfo)
3240 {
3241 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3242
3243 /* First walk the SLP graph assigning each involved scalar stmt a
3244 corresponding SLP graph entry and upon visiting a previously
3245 marked stmt, make the stmt's leader the current SLP graph entry. */
3246 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
3247 hash_map<slp_instance, slp_instance> instance_leader;
3248 slp_instance instance;
3249 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3250 {
3251 instance_leader.put (instance, instance);
3252 vect_bb_partition_graph_r (bb_vinfo,
3253 instance, SLP_INSTANCE_TREE (instance),
3254 stmt_to_instance, instance_leader);
3255 }
3256
3257 /* Then collect entries to each independent subgraph. */
3258 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
3259 {
3260 slp_instance leader = get_ultimate_leader (instance, instance_leader);
3261 leader->subgraph_entries.safe_push (instance);
3262 if (dump_enabled_p ()
3263 && leader != instance)
3264 dump_printf_loc (MSG_NOTE, vect_location,
3265 "instance %p is leader of %p\n",
3266 leader, instance);
3267 }
3268 }
3269
3270 /* Compute the scalar cost of the SLP node NODE and its children
3271 and record it in COST_VEC. Do not account defs that are marked in
3272 LIFE and update LIFE according to uses of NODE. */
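/* A lane is marked live in LIFE when its scalar def has a use outside the
SLP graph; such a lane keeps its scalar stmt, so neither it nor the defs
it requires from the children count as scalar savings. */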
3273
3274 static void
3275 vect_bb_slp_scalar_cost (vec_info *vinfo,
3276 slp_tree node, vec<bool, va_heap> *life,
3277 stmt_vector_for_cost *cost_vec,
3278 hash_set<slp_tree> &visited)
3279 {
3280 unsigned i;
3281 stmt_vec_info stmt_info;
3282 slp_tree child;
3283
3284 if (visited.add (node))
3285 return;
3286
3287 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3288 {
3289 ssa_op_iter op_iter;
3290 def_operand_p def_p;
3291
3292 if ((*life)[i])
3293 continue;
3294
3295 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3296 gimple *orig_stmt = orig_stmt_info->stmt;
3297
3298 /* If there is a non-vectorized use of the defs then the scalar
3299 stmt is kept live, in which case we do not account it or any
3300 required defs in the SLP children in the scalar cost. This
3301 way we make the vectorization more costly when compared to
3302 the scalar cost. */
3303 if (!STMT_VINFO_LIVE_P (stmt_info))
3304 {
3305 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3306 {
3307 imm_use_iterator use_iter;
3308 gimple *use_stmt;
3309 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3310 if (!is_gimple_debug (use_stmt))
3311 {
3312 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3313 if (!use_stmt_info
3314 || !PURE_SLP_STMT
3315 (vect_stmt_to_vectorize (use_stmt_info)))
3316 {
3317 (*life)[i] = true;
3318 BREAK_FROM_IMM_USE_STMT (use_iter);
3319 }
3320 }
3321 }
3322 if ((*life)[i])
3323 continue;
3324 }
3325
3326 /* Count scalar stmts only once. */
3327 if (gimple_visited_p (orig_stmt))
3328 continue;
3329 gimple_set_visited (orig_stmt, true);
3330
3331 vect_cost_for_stmt kind;
3332 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3333 {
3334 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3335 kind = scalar_load;
3336 else
3337 kind = scalar_store;
3338 }
3339 else if (vect_nop_conversion_p (orig_stmt_info))
3340 continue;
3341 else
3342 kind = scalar_stmt;
3343 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3344 }
3345
3346 auto_vec<bool, 20> subtree_life;
3347 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3348 {
3349 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3350 {
3351 /* Do not directly pass LIFE to the recursive call; copy it to
3352 confine changes in the callee to the current child/subtree. */
3353 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3354 {
3355 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
3356 for (unsigned j = 0;
3357 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3358 {
3359 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3360 if (perm.first == i)
3361 subtree_life[perm.second] = (*life)[j];
3362 }
3363 }
3364 else
3365 {
3366 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3367 subtree_life.safe_splice (*life);
3368 }
3369 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3370 visited);
3371 subtree_life.truncate (0);
3372 }
3373 }
3374 }
3375
3376 /* Check if vectorization of the basic block is profitable for the
3377 subgraph denoted by SLP_INSTANCES. */
3378
3379 static bool
3380 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
3381 vec<slp_instance> slp_instances)
3382 {
3383 slp_instance instance;
3384 int i;
3385 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3386 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3387
3388 void *vect_target_cost_data = init_cost (NULL);
3389
3390 /* Calculate scalar cost and sum the cost for the vector stmts
3391 previously collected. */
3392 stmt_vector_for_cost scalar_costs;
3393 scalar_costs.create (0);
3394 hash_set<slp_tree> visited;
3395 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3396 {
3397 auto_vec<bool, 20> life;
3398 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
3399 true);
3400 vect_bb_slp_scalar_cost (bb_vinfo,
3401 SLP_INSTANCE_TREE (instance),
3402 &life, &scalar_costs, visited);
3403 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
3404 instance->cost_vec.release ();
3405 }
3406 /* Unset visited flag. */
3407 stmt_info_for_cost *si;
3408 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3409 gimple_set_visited (si->stmt_info->stmt, false);
3410
3411 void *scalar_target_cost_data = init_cost (NULL);
3412 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
3413 scalar_costs.release ();
3414 unsigned dummy;
3415 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
3416 destroy_cost_data (scalar_target_cost_data);
3417
3418 /* Complete the target-specific vector cost calculation. */
3419 finish_cost (vect_target_cost_data, &vec_prologue_cost,
3420 &vec_inside_cost, &vec_epilogue_cost);
3421 destroy_cost_data (vect_target_cost_data);
3422
3423 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3424
3425 if (dump_enabled_p ())
3426 {
3427 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3428 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3429 vec_inside_cost);
3430 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3431 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3432 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3433 }
3434
3435 /* Vectorization is profitable if its cost is no more than the cost of the
3436 scalar version. Note that we err on the vector side for equal cost because
3437 the cost estimate is otherwise quite pessimistic (constant uses are
3438 free on the scalar side but cost a load on the vector side for
3439 example). */
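/* For instance, with a prologue cost of 4, an inside cost of 8, an epilogue
cost of 0 and a scalar cost of 12, the comparison below is 12 > 12, so
vectorization is still considered profitable. */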
3440 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3441 return false;
3442
3443 return true;
3444 }
3445
/* Find any vectorizable constructors and add them to the grouped_stores
   array.  */
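/* E.g. (illustrative GIMPLE): a statement like

     v_4 = {_1, _2, _3, _5};

   where v_4 has type V4SI and the elements are scalar SSA names is
   recorded as an SLP seed below, while a uniform constructor such as
   {_1, _1, _1, _1} or one whose elements are themselves vectors is
   skipped.  */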
3448
3449 static void
3450 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3451 {
3452 for (gimple *stmt : bb_vinfo->region_stmts ())
3453 {
3454 gassign *assign = dyn_cast<gassign *> (stmt);
3455 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3456 continue;
3457
3458 tree rhs = gimple_assign_rhs1 (assign);
3459 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3460 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3461 CONSTRUCTOR_NELTS (rhs))
3462 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3463 || uniform_vector_p (rhs))
3464 continue;
3465
3466 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3467 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3468 }
3469 }
3470
3471 /* Check if the region described by BB_VINFO can be vectorized, returning
3472 true if so. When returning false, set FATAL to true if the same failure
3473 would prevent vectorization at other vector sizes, false if it is still
3474 worth trying other sizes. N_STMTS is the number of statements in the
3475 region. */
3476
3477 static bool
3478 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
3479 vec<int> *dataref_groups)
3480 {
3481 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3482
3483 slp_instance instance;
3484 int i;
3485 poly_uint64 min_vf = 2;
3486
3487 /* The first group of checks is independent of the vector size. */
3488 fatal = true;
3489
3490 /* Analyze the data references. */
3491
3492 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3493 {
3494 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3496 "not vectorized: unhandled data-ref in basic "
3497 "block.\n");
3498 return false;
3499 }
3500
3501 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
3502 {
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3505 "not vectorized: unhandled data access in "
3506 "basic block.\n");
3507 return false;
3508 }
3509
3510 vect_slp_check_for_constructors (bb_vinfo);
3511
  /* If there are no grouped stores and no constructors in the region,
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
3515 if (bb_vinfo->grouped_stores.is_empty ())
3516 {
3517 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3519 "not vectorized: no grouped stores in "
3520 "basic block.\n");
3521 return false;
3522 }
3523
  /* The rest of the analysis below depends on the vector size in some way,
     so failures from here on are worth retrying with other sizes.  */
3525 fatal = false;
3526
3527 vect_pattern_recog (bb_vinfo);
3528
3529 /* Check the SLP opportunities in the basic block, analyze and build SLP
3530 trees. */
3531 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3532 {
3533 if (dump_enabled_p ())
3534 {
3535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3536 "Failed to SLP the basic block.\n");
3537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3538 "not vectorized: failed to find SLP opportunities "
3539 "in basic block.\n");
3540 }
3541 return false;
3542 }
3543
3544 /* Optimize permutations. */
3545 vect_optimize_slp (bb_vinfo);
3546
3547 vect_record_base_alignments (bb_vinfo);
3548
3549 /* Analyze and verify the alignment of data references and the
3550 dependence in the SLP instances. */
3551 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3552 {
3553 vect_location = instance->location ();
3554 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
3555 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3556 {
3557 slp_tree node = SLP_INSTANCE_TREE (instance);
3558 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3559 if (dump_enabled_p ())
3560 dump_printf_loc (MSG_NOTE, vect_location,
3561 "removing SLP instance operations starting from: %G",
3562 stmt_info->stmt);
3563 vect_free_slp_instance (instance);
3564 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3565 continue;
3566 }
3567
3568 /* Mark all the statements that we want to vectorize as pure SLP and
3569 relevant. */
3570 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3571 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3572 if (SLP_INSTANCE_ROOT_STMT (instance))
3573 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3574
3575 i++;
3576 }
3577 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3578 return false;
3579
3580 if (!vect_slp_analyze_operations (bb_vinfo))
3581 {
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "not vectorized: bad operation in basic block.\n");
3585 return false;
3586 }
3587
3588 vect_bb_partition_graph (bb_vinfo);
3589
3590 return true;
3591 }
3592
3593 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3594 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3595 on success. The region has N_STMTS statements and has the datarefs
3596 given by DATAREFS. */
3597
3598 static bool
3599 vect_slp_region (gimple_stmt_iterator region_begin,
3600 gimple_stmt_iterator region_end,
3601 vec<data_reference_p> datarefs,
3602 vec<int> *dataref_groups,
3603 unsigned int n_stmts)
3604 {
3605 bb_vec_info bb_vinfo;
3606 auto_vector_modes vector_modes;
3607
  /* Autodetect the first vector mode we try.  */
3609 machine_mode next_vector_mode = VOIDmode;
3610 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3611 unsigned int mode_i = 0;
3612
3613 vec_info_shared shared;
3614
3615 machine_mode autodetected_vector_mode = VOIDmode;
3616 while (1)
3617 {
3618 bool vectorized = false;
3619 bool fatal = false;
3620 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3621
3622 bool first_time_p = shared.datarefs.is_empty ();
3623 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3624 if (first_time_p)
3625 bb_vinfo->shared->save_datarefs ();
3626 else
3627 bb_vinfo->shared->check_datarefs ();
3628 bb_vinfo->vector_mode = next_vector_mode;
3629
3630 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
3631 && dbg_cnt (vect_slp))
3632 {
3633 if (dump_enabled_p ())
3634 {
3635 dump_printf_loc (MSG_NOTE, vect_location,
3636 "***** Analysis succeeded with vector mode"
3637 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3638 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3639 }
3640
3641 bb_vinfo->shared->check_datarefs ();
3642
3643 unsigned i;
3644 slp_instance instance;
3645 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
3646 {
3647 if (instance->subgraph_entries.is_empty ())
3648 continue;
3649
3650 vect_location = instance->location ();
3651 if (!unlimited_cost_model (NULL)
3652 && !vect_bb_vectorization_profitable_p
3653 (bb_vinfo, instance->subgraph_entries))
3654 {
3655 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3657 "not vectorized: vectorization is not "
3658 "profitable.\n");
3659 continue;
3660 }
3661
3662 if (!vectorized && dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE, vect_location,
3664 "Basic block will be vectorized "
3665 "using SLP\n");
3666 vectorized = true;
3667
3668 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
3669
3670 unsigned HOST_WIDE_INT bytes;
3671 if (dump_enabled_p ())
3672 {
3673 if (GET_MODE_SIZE
3674 (bb_vinfo->vector_mode).is_constant (&bytes))
3675 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3676 "basic block part vectorized using %wu "
3677 "byte vectors\n", bytes);
3678 else
3679 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3680 "basic block part vectorized using "
3681 "variable length vectors\n");
3682 }
3683 }
3684 }
3685 else
3686 {
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 "***** Analysis failed with vector mode %s\n",
3690 GET_MODE_NAME (bb_vinfo->vector_mode));
3691 }
3692
3693 if (mode_i == 0)
3694 autodetected_vector_mode = bb_vinfo->vector_mode;
3695
3696 if (!fatal)
3697 while (mode_i < vector_modes.length ()
3698 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3699 {
3700 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_NOTE, vect_location,
3702 "***** The result for vector mode %s would"
3703 " be the same\n",
3704 GET_MODE_NAME (vector_modes[mode_i]));
3705 mode_i += 1;
3706 }
3707
3708 delete bb_vinfo;
3709
3710 if (mode_i < vector_modes.length ()
3711 && VECTOR_MODE_P (autodetected_vector_mode)
3712 && (related_vector_mode (vector_modes[mode_i],
3713 GET_MODE_INNER (autodetected_vector_mode))
3714 == autodetected_vector_mode)
3715 && (related_vector_mode (autodetected_vector_mode,
3716 GET_MODE_INNER (vector_modes[mode_i]))
3717 == vector_modes[mode_i]))
3718 {
3719 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_NOTE, vect_location,
3721 "***** Skipping vector mode %s, which would"
3722 " repeat the analysis for %s\n",
3723 GET_MODE_NAME (vector_modes[mode_i]),
3724 GET_MODE_NAME (autodetected_vector_mode));
3725 mode_i += 1;
3726 }
3727
3728 if (vectorized
3729 || mode_i == vector_modes.length ()
3730 || autodetected_vector_mode == VOIDmode
	 /* If vect_slp_analyze_bb_1 signaled that analysis for all
	    vector sizes will fail, do not bother iterating.  */
3733 || fatal)
3734 return vectorized;
3735
3736 /* Try the next biggest vector size. */
3737 next_vector_mode = vector_modes[mode_i++];
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "***** Re-trying analysis with vector mode %s\n",
3741 GET_MODE_NAME (next_vector_mode));
3742 }
3743 }
3744
3745 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3746 true if anything in the basic-block was vectorized. */
3747
3748 bool
3749 vect_slp_bb (basic_block bb)
3750 {
3751 vec<data_reference_p> datarefs = vNULL;
3752 vec<int> dataref_groups = vNULL;
3753 int insns = 0;
3754 int current_group = 0;
3755 gimple_stmt_iterator region_begin = gsi_start_nondebug_after_labels_bb (bb);
3756 gimple_stmt_iterator region_end = gsi_last_bb (bb);
3757 if (!gsi_end_p (region_end))
3758 gsi_next (&region_end);
3759
3760 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
3761 gsi_next (&gsi))
3762 {
3763 gimple *stmt = gsi_stmt (gsi);
3764 if (is_gimple_debug (stmt))
3765 continue;
3766
3767 insns++;
3768
3769 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3770 vect_location = stmt;
3771
3772 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
3773 &dataref_groups, current_group))
3774 ++current_group;
3775
      if (insns > param_slp_max_insns_in_bb)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: too many instructions in "
			     "basic block.\n");
	  return false;
	}
3783 }
3784
3785 return vect_slp_region (region_begin, region_end, datarefs,
3786 &dataref_groups, insns);
3787 }
3788
3789
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.
3793
3794 The approach we use is:
3795
3796 (1) Find a vector mode VM with integer elements of mode IM.
3797
3798 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3799 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3800 from small vectors to IM.
3801
3802 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3803
3804 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3805 correct byte contents.
3806
3807 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3808
3809 We try to find the largest IM for which this sequence works, in order
3810 to cut down on the number of interleaves. */
3811
3812 void
3813 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3814 vec<tree> elts, unsigned int nresults,
3815 vec<tree> &results)
3816 {
3817 unsigned int nelts = elts.length ();
3818 tree element_type = TREE_TYPE (vector_type);
3819
3820 /* (1) Find a vector mode VM with integer elements of mode IM. */
3821 unsigned int nvectors = 1;
3822 tree new_vector_type;
3823 tree permutes[2];
3824 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3825 &nvectors, &new_vector_type,
3826 permutes))
3827 gcc_unreachable ();
3828
3829 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3830 unsigned int partial_nelts = nelts / nvectors;
3831 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3832
3833 tree_vector_builder partial_elts;
3834 auto_vec<tree, 32> pieces (nvectors * 2);
3835 pieces.quick_grow (nvectors * 2);
3836 for (unsigned int i = 0; i < nvectors; ++i)
3837 {
3838 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3839 ELTS' has mode IM. */
3840 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3841 for (unsigned int j = 0; j < partial_nelts; ++j)
3842 partial_elts.quick_push (elts[i * partial_nelts + j]);
3843 tree t = gimple_build_vector (seq, &partial_elts);
3844 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3845 TREE_TYPE (new_vector_type), t);
3846
3847 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3848 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3849 }
3850
3851 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3852 correct byte contents.
3853
3854 We need to repeat the following operation log2(nvectors) times:
3855
3856 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3857 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3858
3859 However, if each input repeats every N elements and the VF is
3860 a multiple of N * 2, the HI result is the same as the LO. */
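  /* For example (illustrative), with NVECTORS == 4 and ELTS' == {A,B,C,D}
     the pieces start out as {A,A,...} {B,B,...} {C,C,...} {D,D,...}.
     The first round interleaves pieces 0/2 and 1/3, giving
     {A,C,A,C,...} {A,C,A,C,...} {B,D,B,D,...} {B,D,B,D,...}, and the
     second round interleaves those again into the desired
     {A,B,C,D,A,B,C,D,...} repetition.  */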
3861 unsigned int in_start = 0;
3862 unsigned int out_start = nvectors;
3863 unsigned int hi_start = nvectors / 2;
3864 /* A bound on the number of outputs needed to produce NRESULTS results
3865 in the final iteration. */
3866 unsigned int noutputs_bound = nvectors * nresults;
3867 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3868 {
3869 noutputs_bound /= 2;
3870 unsigned int limit = MIN (noutputs_bound, nvectors);
3871 for (unsigned int i = 0; i < limit; ++i)
3872 {
3873 if ((i & 1) != 0
3874 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3875 2 * in_repeat))
3876 {
3877 pieces[out_start + i] = pieces[out_start + i - 1];
3878 continue;
3879 }
3880
3881 tree output = make_ssa_name (new_vector_type);
3882 tree input1 = pieces[in_start + (i / 2)];
3883 tree input2 = pieces[in_start + (i / 2) + hi_start];
3884 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3885 input1, input2,
3886 permutes[i & 1]);
3887 gimple_seq_add_stmt (seq, stmt);
3888 pieces[out_start + i] = output;
3889 }
3890 std::swap (in_start, out_start);
3891 }
3892
3893 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3894 results.reserve (nresults);
3895 for (unsigned int i = 0; i < nresults; ++i)
3896 if (i < nvectors)
3897 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3898 pieces[in_start + i]));
3899 else
3900 results.quick_push (results[i - nvectors]);
3901 }
3902
3903
3904 /* For constant and loop invariant defs in OP_NODE this function creates
3905 vector defs that will be used in the vectorized stmts and stores them
3906 to SLP_TREE_VEC_DEFS of OP_NODE. */
3907
3908 static void
3909 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3910 {
3911 unsigned HOST_WIDE_INT nunits;
3912 tree vec_cst;
3913 unsigned j, number_of_places_left_in_vector;
3914 tree vector_type;
3915 tree vop;
3916 int group_size = op_node->ops.length ();
3917 unsigned int vec_num, i;
3918 unsigned number_of_copies = 1;
3919 bool constant_p;
3920 gimple_seq ctor_seq = NULL;
3921 auto_vec<tree, 16> permute_results;
3922
3923 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3924 vector_type = SLP_TREE_VECTYPE (op_node);
3925
3926 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3927 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3928 auto_vec<tree> voprnds (number_of_vectors);
3929
3930 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3931 created vectors. It is greater than 1 if unrolling is performed.
3932
3933 For example, we have two scalar operands, s1 and s2 (e.g., group of
3934 strided accesses of size two), while NUNITS is four (i.e., four scalars
3935 of this type can be packed in a vector). The output vector will contain
3936 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3937 will be 2).
3938
3939 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3940 containing the operands.
3941
3942 For example, NUNITS is four as before, and the group size is 8
3943 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3944 {s5, s6, s7, s8}. */
3945
3946 /* When using duplicate_and_interleave, we just need one element for
3947 each scalar statement. */
3948 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3949 nunits = group_size;
3950
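  /* E.g. (illustrative), continuing the examples above: nunits == 4,
     group_size == 2 and a single vector yield number_of_copies == 2;
     nunits == 4, group_size == 8 and two vectors yield
     number_of_copies == 1.  */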
3951 number_of_copies = nunits * number_of_vectors / group_size;
3952
3953 number_of_places_left_in_vector = nunits;
3954 constant_p = true;
3955 tree_vector_builder elts (vector_type, nunits, 1);
3956 elts.quick_grow (nunits);
3957 stmt_vec_info insert_after = NULL;
3958 for (j = 0; j < number_of_copies; j++)
3959 {
3960 tree op;
3961 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3962 {
3963 /* Create 'vect_ = {op0,op1,...,opn}'. */
3964 number_of_places_left_in_vector--;
3965 tree orig_op = op;
3966 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3967 {
3968 if (CONSTANT_CLASS_P (op))
3969 {
3970 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3971 {
3972 /* Can't use VIEW_CONVERT_EXPR for booleans because
3973 of possibly different sizes of scalar value and
3974 vector element. */
3975 if (integer_zerop (op))
3976 op = build_int_cst (TREE_TYPE (vector_type), 0);
3977 else if (integer_onep (op))
3978 op = build_all_ones_cst (TREE_TYPE (vector_type));
3979 else
3980 gcc_unreachable ();
3981 }
3982 else
3983 op = fold_unary (VIEW_CONVERT_EXPR,
3984 TREE_TYPE (vector_type), op);
3985 gcc_assert (op && CONSTANT_CLASS_P (op));
3986 }
3987 else
3988 {
3989 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3990 gimple *init_stmt;
3991 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3992 {
3993 tree true_val
3994 = build_all_ones_cst (TREE_TYPE (vector_type));
3995 tree false_val
3996 = build_zero_cst (TREE_TYPE (vector_type));
3997 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3998 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3999 op, true_val,
4000 false_val);
4001 }
4002 else
4003 {
4004 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
4005 op);
4006 init_stmt
4007 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
4008 op);
4009 }
4010 gimple_seq_add_stmt (&ctor_seq, init_stmt);
4011 op = new_temp;
4012 }
4013 }
4014 elts[number_of_places_left_in_vector] = op;
4015 if (!CONSTANT_CLASS_P (op))
4016 constant_p = false;
4017 /* For BB vectorization we have to compute an insert location
4018 when a def is inside the analyzed region since we cannot
4019 simply insert at the BB start in this case. */
4020 stmt_vec_info opdef;
4021 if (TREE_CODE (orig_op) == SSA_NAME
4022 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
4023 && is_a <bb_vec_info> (vinfo)
4024 && (opdef = vinfo->lookup_def (orig_op)))
4025 {
4026 if (!insert_after)
4027 insert_after = opdef;
4028 else
4029 insert_after = get_later_stmt (insert_after, opdef);
4030 }
4031
4032 if (number_of_places_left_in_vector == 0)
4033 {
4034 if (constant_p
4035 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4036 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4037 vec_cst = gimple_build_vector (&ctor_seq, &elts);
4038 else
4039 {
4040 if (permute_results.is_empty ())
4041 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
4042 elts, number_of_vectors,
4043 permute_results);
4044 vec_cst = permute_results[number_of_vectors - j - 1];
4045 }
4046 if (!gimple_seq_empty_p (ctor_seq))
4047 {
4048 if (insert_after)
4049 {
4050 gimple_stmt_iterator gsi
4051 = gsi_for_stmt (insert_after->stmt);
4052 gsi_insert_seq_after (&gsi, ctor_seq,
4053 GSI_CONTINUE_LINKING);
4054 }
4055 else
4056 vinfo->insert_seq_on_entry (NULL, ctor_seq);
4057 ctor_seq = NULL;
4058 }
4059 voprnds.quick_push (vec_cst);
4060 insert_after = NULL;
4061 number_of_places_left_in_vector = nunits;
4062 constant_p = true;
4063 elts.new_vector (vector_type, nunits, 1);
4064 elts.quick_grow (nunits);
4065 }
4066 }
4067 }
4068
  /* Since the vectors are created in reverse order, put them back in the
     proper order here.  */
4071 vec_num = voprnds.length ();
4072 for (j = vec_num; j != 0; j--)
4073 {
4074 vop = voprnds[j - 1];
4075 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4076 }
4077
  /* If VF is greater than the unrolling factor needed for the SLP group of
     stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
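  /* E.g. (illustrative), with two vector defs v1 = {s1,s2,s3,s4} and
     v2 = {s5,s6,s7,s8} but NUMBER_OF_VECTORS == 4 the loop below extends
     SLP_TREE_VEC_DEFS to v1 v2 v1 v2 by re-pushing the first VEC_NUM
     defs.  */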
4082 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
4083 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
4084 i++)
4085 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
4086 }
4087
4088 /* Get the Ith vectorized definition from SLP_NODE. */
4089
4090 tree
4091 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
4092 {
4093 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
4094 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
4095 else
4096 return SLP_TREE_VEC_DEFS (slp_node)[i];
4097 }
4098
4099 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4100
4101 void
4102 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
4103 {
4104 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
4105 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
4106 {
4107 unsigned j;
4108 gimple *vec_def_stmt;
4109 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
4110 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
4111 }
4112 else
4113 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
4114 }
4115
4116 /* Get N vectorized definitions for SLP_NODE. */
4117
4118 void
4119 vect_get_slp_defs (vec_info *,
4120 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
4121 {
4122 if (n == -1U)
4123 n = SLP_TREE_CHILDREN (slp_node).length ();
4124
4125 for (unsigned i = 0; i < n; ++i)
4126 {
4127 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
4128 vec<tree> vec_defs = vNULL;
4129 vect_get_slp_defs (child, &vec_defs);
4130 vec_oprnds->quick_push (vec_defs);
4131 }
4132 }
4133
4134 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4135 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4136 permute statements for the SLP node NODE. */
4137
4138 bool
4139 vect_transform_slp_perm_load (vec_info *vinfo,
4140 slp_tree node, vec<tree> dr_chain,
4141 gimple_stmt_iterator *gsi, poly_uint64 vf,
4142 bool analyze_only, unsigned *n_perms)
4143 {
4144 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4145 int vec_index = 0;
4146 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4147 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4148 unsigned int mask_element;
4149 machine_mode mode;
4150
4151 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
4152 return false;
4153
4154 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
4155
4156 mode = TYPE_MODE (vectype);
4157 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4158
4159 /* Initialize the vect stmts of NODE to properly insert the generated
4160 stmts later. */
4161 if (! analyze_only)
4162 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
4163 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
4164 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
4165
  /* Generate permutation masks for every NODE.  Number of masks for each
     NODE is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for the permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */
4183
4184 int vect_stmts_counter = 0;
4185 unsigned int index = 0;
4186 int first_vec_index = -1;
4187 int second_vec_index = -1;
4188 bool noop_p = true;
4189 *n_perms = 0;
4190
4191 vec_perm_builder mask;
4192 unsigned int nelts_to_build;
4193 unsigned int nvectors_per_build;
4194 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
4195 && multiple_p (nunits, group_size));
4196 if (repeating_p)
4197 {
4198 /* A single vector contains a whole number of copies of the node, so:
4199 (a) all permutes can use the same mask; and
4200 (b) the permutes only need a single vector input. */
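      /* E.g. (illustrative): for group_size == 2, nunits == 8 and a load
	 permutation of {1,0} the loop further below builds the single
	 encoded mask {1,0,3,2,5,4,...}, i.e. a swap of adjacent lanes
	 repeated across the whole vector.  */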
4201 mask.new_vector (nunits, group_size, 3);
4202 nelts_to_build = mask.encoded_nelts ();
4203 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4204 }
4205 else
4206 {
4207 /* We need to construct a separate mask for each vector statement. */
4208 unsigned HOST_WIDE_INT const_nunits, const_vf;
4209 if (!nunits.is_constant (&const_nunits)
4210 || !vf.is_constant (&const_vf))
4211 return false;
4212 mask.new_vector (const_nunits, const_nunits, 1);
4213 nelts_to_build = const_vf * group_size;
4214 nvectors_per_build = 1;
4215 }
4216
4217 unsigned int count = mask.encoded_nelts ();
4218 mask.quick_grow (count);
4219 vec_perm_indices indices;
4220
4221 for (unsigned int j = 0; j < nelts_to_build; j++)
4222 {
4223 unsigned int iter_num = j / group_size;
4224 unsigned int stmt_num = j % group_size;
4225 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4226 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4227 if (repeating_p)
4228 {
4229 first_vec_index = 0;
4230 mask_element = i;
4231 }
4232 else
4233 {
4234 /* Enforced before the loop when !repeating_p. */
4235 unsigned int const_nunits = nunits.to_constant ();
4236 vec_index = i / const_nunits;
4237 mask_element = i % const_nunits;
4238 if (vec_index == first_vec_index
4239 || first_vec_index == -1)
4240 {
4241 first_vec_index = vec_index;
4242 }
4243 else if (vec_index == second_vec_index
4244 || second_vec_index == -1)
4245 {
4246 second_vec_index = vec_index;
4247 mask_element += const_nunits;
4248 }
4249 else
4250 {
4251 if (dump_enabled_p ())
4252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4253 "permutation requires at "
4254 "least three vectors %G",
4255 stmt_info->stmt);
4256 gcc_assert (analyze_only);
4257 return false;
4258 }
4259
4260 gcc_assert (mask_element < 2 * const_nunits);
4261 }
4262
4263 if (mask_element != index)
4264 noop_p = false;
4265 mask[index++] = mask_element;
4266
4267 if (index == count && !noop_p)
4268 {
4269 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4270 if (!can_vec_perm_const_p (mode, indices))
4271 {
4272 if (dump_enabled_p ())
4273 {
4274 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4275 vect_location,
4276 "unsupported vect permute { ");
4277 for (i = 0; i < count; ++i)
4278 {
4279 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4280 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4281 }
4282 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4283 }
4284 gcc_assert (analyze_only);
4285 return false;
4286 }
4287
4288 ++*n_perms;
4289 }
4290
4291 if (index == count)
4292 {
4293 if (!analyze_only)
4294 {
4295 tree mask_vec = NULL_TREE;
4296
4297 if (! noop_p)
4298 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4299
4300 if (second_vec_index == -1)
4301 second_vec_index = first_vec_index;
4302
4303 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4304 {
4305 /* Generate the permute statement if necessary. */
4306 tree first_vec = dr_chain[first_vec_index + ri];
4307 tree second_vec = dr_chain[second_vec_index + ri];
4308 gimple *perm_stmt;
4309 if (! noop_p)
4310 {
4311 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4312 tree perm_dest
4313 = vect_create_destination_var (gimple_assign_lhs (stmt),
4314 vectype);
4315 perm_dest = make_ssa_name (perm_dest);
4316 perm_stmt
4317 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4318 first_vec, second_vec,
4319 mask_vec);
4320 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4321 gsi);
4322 }
4323 else
4324 /* If mask was NULL_TREE generate the requested
4325 identity transform. */
4326 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4327
4328 /* Store the vector statement in NODE. */
4329 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4330 }
4331 }
4332
4333 index = 0;
4334 first_vec_index = -1;
4335 second_vec_index = -1;
4336 noop_p = true;
4337 }
4338 }
4339
4340 return true;
4341 }
4342
4343
4344 /* Vectorize the SLP permutations in NODE as specified
4345 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4346 child number and lane number.
4347 Interleaving of two two-lane two-child SLP subtrees (not supported):
4348 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4349 A blend of two four-lane two-child SLP subtrees:
4350 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4351 Highpart of a four-lane one-child SLP subtree (not supported):
4352 [ { 0, 2 }, { 0, 3 } ]
   Currently only a subset of these is supported by the code generation
   below.  */
4354
4355 static bool
4356 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4357 slp_tree node, stmt_vector_for_cost *cost_vec)
4358 {
4359 tree vectype = SLP_TREE_VECTYPE (node);
4360
  /* ??? We currently only support all inputs and the output having the
     same vector type, while the SLP IL should really do a concat + select
     and thus accept arbitrary mismatches.  */
4364 slp_tree child;
4365 unsigned i;
4366 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4367 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4368 {
4369 if (dump_enabled_p ())
4370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4371 "Unsupported lane permutation\n");
4372 return false;
4373 }
4374
4375 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4376 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4377 if (dump_enabled_p ())
4378 {
4379 dump_printf_loc (MSG_NOTE, vect_location,
4380 "vectorizing permutation");
4381 for (unsigned i = 0; i < perm.length (); ++i)
4382 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4383 dump_printf (MSG_NOTE, "\n");
4384 }
4385
4386 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4387 if (!nunits.is_constant ())
4388 return false;
4389 unsigned HOST_WIDE_INT vf = 1;
4390 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4391 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4392 return false;
4393 unsigned olanes = vf * SLP_TREE_LANES (node);
4394 gcc_assert (multiple_p (olanes, nunits));
4395
  /* Compute the { { SLP operand, vector index }, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as an intermediate step.  This part should already work
     with SLP children with an arbitrary number of lanes.  */
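  /* E.g. (illustrative), for the blend [ { 0, 0 }, { 1, 1 }, { 0, 2 },
     { 1, 3 } ] of two four-lane children and vf == 1 this computes
     vperm = { { { 0, 0 }, 0 }, { { 1, 0 }, 1 }, { { 0, 0 }, 2 },
     { { 1, 0 }, 3 } }, from which the code below derives the two-input
     permute mask {0, 5, 2, 7}.  */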
4400 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4401 auto_vec<unsigned> active_lane;
4402 vperm.create (olanes);
4403 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
4404 for (unsigned i = 0; i < vf; ++i)
4405 {
4406 for (unsigned pi = 0; pi < perm.length (); ++pi)
4407 {
4408 std::pair<unsigned, unsigned> p = perm[pi];
4409 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4410 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4411 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4412 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4413 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4414 }
4415 /* Advance to the next group. */
4416 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4417 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4418 }
4419
4420 if (dump_enabled_p ())
4421 {
4422 dump_printf_loc (MSG_NOTE, vect_location, "as");
4423 for (unsigned i = 0; i < vperm.length (); ++i)
4424 {
4425 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4426 dump_printf (MSG_NOTE, ",");
	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
		       vperm[i].first.first, vperm[i].first.second,
		       vperm[i].second);
4430 }
4431 dump_printf (MSG_NOTE, "\n");
4432 }
4433
4434 /* We can only handle two-vector permutes, everything else should
4435 be lowered on the SLP level. The following is closely inspired
4436 by vect_transform_slp_perm_load and is supposed to eventually
4437 replace it.
4438 ??? As intermediate step do code-gen in the SLP tree representation
4439 somehow? */
4440 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4441 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4442 unsigned int const_nunits = nunits.to_constant ();
4443 unsigned int index = 0;
4444 unsigned int mask_element;
4445 vec_perm_builder mask;
4446 mask.new_vector (const_nunits, const_nunits, 1);
4447 unsigned int count = mask.encoded_nelts ();
4448 mask.quick_grow (count);
4449 vec_perm_indices indices;
4450 unsigned nperms = 0;
4451 for (unsigned i = 0; i < vperm.length (); ++i)
4452 {
4453 mask_element = vperm[i].second;
4454 if (first_vec.first == -1U
4455 || first_vec == vperm[i].first)
4456 first_vec = vperm[i].first;
4457 else if (second_vec.first == -1U
4458 || second_vec == vperm[i].first)
4459 {
4460 second_vec = vperm[i].first;
4461 mask_element += const_nunits;
4462 }
4463 else
4464 {
4465 if (dump_enabled_p ())
4466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4467 "permutation requires at "
4468 "least three vectors");
4469 gcc_assert (!gsi);
4470 return false;
4471 }
4472
4473 mask[index++] = mask_element;
4474
4475 if (index == count)
4476 {
4477 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4478 const_nunits);
4479 bool identity_p = indices.series_p (0, 1, 0, 1);
4480 if (!identity_p
4481 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4482 {
4483 if (dump_enabled_p ())
4484 {
4485 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4486 vect_location,
4487 "unsupported vect permute { ");
4488 for (i = 0; i < count; ++i)
4489 {
4490 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4491 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4492 }
4493 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4494 }
4495 gcc_assert (!gsi);
4496 return false;
4497 }
4498
4499 if (!identity_p)
4500 nperms++;
4501 if (gsi)
4502 {
4503 if (second_vec.first == -1U)
4504 second_vec = first_vec;
4505
4506 /* Generate the permute statement if necessary. */
4507 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4508 tree first_def
4509 = vect_get_slp_vect_def (first_node, first_vec.second);
4510 gassign *perm_stmt;
4511 tree perm_dest = make_ssa_name (vectype);
4512 if (!identity_p)
4513 {
4514 slp_tree second_node
4515 = SLP_TREE_CHILDREN (node)[second_vec.first];
4516 tree second_def
4517 = vect_get_slp_vect_def (second_node, second_vec.second);
4518 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4519 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4520 first_def, second_def,
4521 mask_vec);
4522 }
4523 else
4524 /* We need a copy here in case the def was external. */
4525 perm_stmt = gimple_build_assign (perm_dest, first_def);
4526 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4527 /* Store the vector statement in NODE. */
4528 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4529 }
4530
4531 index = 0;
4532 first_vec = std::make_pair (-1U, -1U);
4533 second_vec = std::make_pair (-1U, -1U);
4534 }
4535 }
4536
4537 if (!gsi)
4538 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4539
4540 return true;
4541 }
4542
4543 /* Vectorize SLP instance tree in postorder. */
4544
4545 static void
4546 vect_schedule_slp_instance (vec_info *vinfo,
4547 slp_tree node, slp_instance instance)
4548 {
4549 gimple_stmt_iterator si;
4550 int i;
4551 slp_tree child;
4552
4553 /* See if we have already vectorized the node in the graph of the
4554 SLP instance. */
4555 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4556 && SLP_TREE_VEC_STMTS (node).exists ())
4557 || SLP_TREE_VEC_DEFS (node).exists ())
4558 return;
4559
4560 /* Vectorize externals and constants. */
4561 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4562 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4563 {
4564 /* ??? vectorizable_shift can end up using a scalar operand which is
4565 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4566 node in this case. */
4567 if (!SLP_TREE_VECTYPE (node))
4568 return;
4569
4570 vect_create_constant_vectors (vinfo, node);
4571 return;
4572 }
4573
4574 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4575 vect_schedule_slp_instance (vinfo, child, instance);
4576
4577 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4578 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4579
4580 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4581 if (dump_enabled_p ())
4582 dump_printf_loc (MSG_NOTE, vect_location,
4583 "------>vectorizing SLP node starting from: %G",
4584 stmt_info->stmt);
4585
4586 if (STMT_VINFO_DATA_REF (stmt_info)
4587 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
4588 {
4589 /* Vectorized loads go before the first scalar load to make it
4590 ready early, vectorized stores go before the last scalar
4591 stmt which is where all uses are ready. */
4592 stmt_vec_info last_stmt_info = NULL;
4593 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
4594 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
4595 else /* DR_IS_WRITE */
4596 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4597 si = gsi_for_stmt (last_stmt_info->stmt);
4598 }
4599 else if (SLP_TREE_CHILDREN (node).is_empty ())
4600 {
4601 /* This happens for reduction and induction PHIs where we do not use the
4602 insertion iterator. */
4603 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4604 == cycle_phi_info_type
4605 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4606 == induc_vec_info_type));
4607 si = gsi_none ();
4608 }
4609 else
4610 {
      /* Emit other stmts after the vectorized defs of the children, which
	 is the earliest position possible.  */
4613 gimple *last_stmt = NULL;
4614 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4615 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4616 {
4617 /* For fold-left reductions we are retaining the scalar
4618 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
4619 set so the representation isn't perfect. Resort to the
4620 last scalar def here. */
4621 if (SLP_TREE_VEC_STMTS (child).is_empty ())
4622 {
4623 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
4624 == cycle_phi_info_type);
4625 gphi *phi = as_a <gphi *>
4626 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
4627 if (!last_stmt
4628 || vect_stmt_dominates_stmt_p (last_stmt, phi))
4629 last_stmt = phi;
4630 }
	    /* We are emitting all vectorized stmts in the same place and
	       the last one emitted is the overall last.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
4635 unsigned j;
4636 gimple *vstmt;
4637 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
4638 if (!last_stmt
4639 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4640 last_stmt = vstmt;
4641 }
4642 else if (!SLP_TREE_VECTYPE (child))
4643 {
	    /* For externals without a vector type the scalar defs are used
	       unvectorized, so look at all of them.  */
4645 unsigned j;
4646 tree def;
4647 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
4648 if (TREE_CODE (def) == SSA_NAME
4649 && !SSA_NAME_IS_DEFAULT_DEF (def))
4650 {
4651 gimple *stmt = SSA_NAME_DEF_STMT (def);
4652 if (!last_stmt
4653 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
4654 last_stmt = stmt;
4655 }
4656 }
4657 else
4658 {
4659 /* For externals we have to look at all defs since their
4660 insertion place is decided per vector. But beware
4661 of pre-existing vectors where we need to make sure
4662 we do not insert before the region boundary. */
4663 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
4664 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
4665 last_stmt = gsi_stmt (as_a <bb_vec_info> (vinfo)->region_begin);
4666 else
4667 {
4668 unsigned j;
4669 tree vdef;
4670 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
4671 if (TREE_CODE (vdef) == SSA_NAME
4672 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
4673 {
4674 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
4675 if (!last_stmt
4676 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4677 last_stmt = vstmt;
4678 }
4679 }
4680 }
4681 /* This can happen when all children are pre-existing vectors or
4682 constants. */
4683 if (!last_stmt)
4684 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
4685 if (is_a <gphi *> (last_stmt))
4686 si = gsi_after_labels (gimple_bb (last_stmt));
4687 else
4688 {
4689 si = gsi_for_stmt (last_stmt);
4690 gsi_next (&si);
4691 }
4692 }
4693
4694 bool done_p = false;
4695
4696 /* Handle purely internal nodes. */
4697 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4698 {
      /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case where the stmt is only there for
	 denoting the actual scalar lane defs ...).  So do not call
	 vect_transform_stmt but open-code it here (partly).  */
4704 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
4705 gcc_assert (done);
4706 done_p = true;
4707 }
4708 if (!done_p)
4709 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4710 }
4711
4712 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4713 For loop vectorization this is done in vectorizable_call, but for SLP
4714 it needs to be deferred until end of vect_schedule_slp, because multiple
4715 SLP instances may refer to the same scalar stmt. */
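/* E.g. (illustrative): a vectorized scalar call _5 = foo (_3) is replaced
   below by _5 = 0.0, a zero constant of the lhs type; the assignment is
   trivially dead and can be cleaned up later.  */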
4716
4717 static void
4718 vect_remove_slp_scalar_calls (vec_info *vinfo,
4719 slp_tree node, hash_set<slp_tree> &visited)
4720 {
4721 gimple *new_stmt;
4722 gimple_stmt_iterator gsi;
4723 int i;
4724 slp_tree child;
4725 tree lhs;
4726 stmt_vec_info stmt_info;
4727
4728 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4729 return;
4730
4731 if (visited.add (node))
4732 return;
4733
4734 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4735 vect_remove_slp_scalar_calls (vinfo, child, visited);
4736
4737 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4738 {
4739 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4740 if (!stmt || gimple_bb (stmt) == NULL)
4741 continue;
4742 if (is_pattern_stmt_p (stmt_info)
4743 || !PURE_SLP_STMT (stmt_info))
4744 continue;
4745 lhs = gimple_call_lhs (stmt);
4746 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4747 gsi = gsi_for_stmt (stmt);
4748 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4749 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4750 }
4751 }
4752
4753 static void
4754 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4755 {
4756 hash_set<slp_tree> visited;
4757 vect_remove_slp_scalar_calls (vinfo, node, visited);
4758 }
4759
4760 /* Vectorize the instance root. */
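/* E.g. (illustrative): a root x_1 = {a_2, a_3, a_4, a_5} covered by a
   single vector stmt is replaced by x_1 = vect_a.0_7 (through a
   VIEW_CONVERT_EXPR if the types differ), while with several vector
   stmts the root is rebuilt as a CONSTRUCTOR of the vector parts.  */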
4761
4762 void
4763 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4764 {
4765 gassign *rstmt = NULL;
4766
4767 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4768 {
4769 gimple *child_stmt;
4770 int j;
4771
4772 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4773 {
4774 tree vect_lhs = gimple_get_lhs (child_stmt);
4775 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4776 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4777 TREE_TYPE (vect_lhs)))
4778 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4779 vect_lhs);
4780 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4781 break;
4782 }
4783 }
4784 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4785 {
4786 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4787 gimple *child_stmt;
4788 int j;
4789 vec<constructor_elt, va_gc> *v;
4790 vec_alloc (v, nelts);
4791
4792 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4793 {
4794 CONSTRUCTOR_APPEND_ELT (v,
4795 NULL_TREE,
4796 gimple_get_lhs (child_stmt));
4797 }
4798 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4799 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4800 tree r_constructor = build_constructor (rtype, v);
4801 rstmt = gimple_build_assign (lhs, r_constructor);
4802 }
4803
4804 gcc_assert (rstmt);
4805
4806 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4807 gsi_replace (&rgsi, rstmt, true);
4808 }
4809
4810 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
4811
4812 void
4813 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
4814 {
4815 slp_instance instance;
4816 unsigned int i;
4817
4818 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4819 {
4820 slp_tree node = SLP_INSTANCE_TREE (instance);
4821 if (dump_enabled_p ())
4822 {
4823 dump_printf_loc (MSG_NOTE, vect_location,
4824 "Vectorizing SLP tree:\n");
4825 if (SLP_INSTANCE_ROOT_STMT (instance))
4826 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
4827 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
4828 vect_print_slp_graph (MSG_NOTE, vect_location,
4829 SLP_INSTANCE_TREE (instance));
4830 }
4831 /* Schedule the tree of INSTANCE. */
4832 vect_schedule_slp_instance (vinfo, node, instance);
4833
4834 if (SLP_INSTANCE_ROOT_STMT (instance))
4835 vectorize_slp_instance_root_stmt (node, instance);
4836
4837 if (dump_enabled_p ())
4838 dump_printf_loc (MSG_NOTE, vect_location,
4839 "vectorizing stmts using SLP.\n");
4840 }
4841
4842 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4843 {
4844 slp_tree root = SLP_INSTANCE_TREE (instance);
4845 stmt_vec_info store_info;
4846 unsigned int j;
4847
4848 /* For reductions set the latch values of the vectorized PHIs. */
4849 if (instance->reduc_phis
4850 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4851 (instance->reduc_phis)) != FOLD_LEFT_REDUCTION
4852 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4853 (instance->reduc_phis)) != EXTRACT_LAST_REDUCTION)
4854 {
4855 slp_tree slp_node = root;
4856 slp_tree phi_node = instance->reduc_phis;
4857 gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
4858 edge e = loop_latch_edge (gimple_bb (phi)->loop_father);
4859 gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
4860 == SLP_TREE_VEC_STMTS (slp_node).length ());
4861 for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
4862 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]),
4863 vect_get_slp_vect_def (slp_node, j),
4864 e, gimple_phi_arg_location (phi, e->dest_idx));
4865 }
4866
4867 /* Remove scalar call stmts. Do not do this for basic-block
4868 vectorization as not all uses may be vectorized.
4869 ??? Why should this be necessary? DCE should be able to
4870 remove the stmts itself.
4871 ??? For BB vectorization we can as well remove scalar
4872 stmts starting from the SLP tree root if they have no
4873 uses. */
4874 if (is_a <loop_vec_info> (vinfo))
4875 vect_remove_slp_scalar_calls (vinfo, root);
4876
      /* Remove the original scalar stmts of vectorized stores.  */
4878 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4879 {
4880 if (!STMT_VINFO_DATA_REF (store_info)
4881 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
4882 break;
4883
4884 store_info = vect_orig_stmt (store_info);
4885 /* Free the attached stmt_vec_info and remove the stmt. */
4886 vinfo->remove_stmt (store_info);
4887 }
4888 }
4889 }