/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
52 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
53 slp_tree
, stmt_vector_for_cost
*);
55 object_allocator
<_slp_tree
> *slp_tree_pool
;
58 _slp_tree::operator new (size_t n
)
60 gcc_assert (n
== sizeof (_slp_tree
));
61 return slp_tree_pool
->allocate_raw ();
65 _slp_tree::operator delete (void *node
, size_t n
)
67 gcc_assert (n
== sizeof (_slp_tree
));
68 slp_tree_pool
->remove_raw (node
);
72 /* Initialize a SLP node. */
74 _slp_tree::_slp_tree ()
76 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
77 SLP_TREE_SCALAR_OPS (this) = vNULL
;
78 SLP_TREE_VEC_STMTS (this) = vNULL
;
79 SLP_TREE_VEC_DEFS (this) = vNULL
;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL
;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
85 SLP_TREE_CODE (this) = ERROR_MARK
;
86 SLP_TREE_VECTYPE (this) = NULL_TREE
;
87 SLP_TREE_REPRESENTATIVE (this) = NULL
;
88 SLP_TREE_REF_COUNT (this) = 1;
93 /* Tear down a SLP node. */
95 _slp_tree::~_slp_tree ()
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
109 vect_free_slp_tree (slp_tree node
)
114 if (--SLP_TREE_REF_COUNT (node
) != 0)
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
119 vect_free_slp_tree (child
);
124 /* Return a location suitable for dumpings related to the SLP instance. */
127 _slp_instance::location () const
130 return root_stmt
->stmt
;
132 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
136 /* Free the memory allocated for the SLP instance. */
139 vect_free_slp_instance (slp_instance instance
)
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
142 SLP_INSTANCE_LOADS (instance
).release ();
143 instance
->subgraph_entries
.release ();
144 instance
->cost_vec
.release ();
149 /* Create an SLP node for SCALAR_STMTS. */
152 vect_create_new_slp_node (slp_tree node
,
153 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
155 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
156 SLP_TREE_CHILDREN (node
).create (nops
);
157 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
158 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
159 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
163 /* Create an SLP node for SCALAR_STMTS. */
166 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
168 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
171 /* Create an SLP node for OPS. */
174 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
176 SLP_TREE_SCALAR_OPS (node
) = ops
;
177 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
178 SLP_TREE_LANES (node
) = ops
.length ();
182 /* Create an SLP node for OPS. */
185 vect_create_new_slp_node (vec
<tree
> ops
)
187 return vect_create_new_slp_node (new _slp_tree
, ops
);
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
194 typedef struct _slp_oprnd_info
196 /* Def-stmts for the operands. */
197 vec
<stmt_vec_info
> def_stmts
;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
204 enum vect_def_type first_dt
;
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
211 static vec
<slp_oprnd_info
>
212 vect_create_oprnd_info (int nops
, int group_size
)
215 slp_oprnd_info oprnd_info
;
216 vec
<slp_oprnd_info
> oprnds_info
;
218 oprnds_info
.create (nops
);
219 for (i
= 0; i
< nops
; i
++)
221 oprnd_info
= XNEW (struct _slp_oprnd_info
);
222 oprnd_info
->def_stmts
.create (group_size
);
223 oprnd_info
->ops
.create (group_size
);
224 oprnd_info
->first_dt
= vect_uninitialized_def
;
225 oprnd_info
->first_op_type
= NULL_TREE
;
226 oprnd_info
->any_pattern
= false;
227 oprnds_info
.quick_push (oprnd_info
);
234 /* Free operands info. */
237 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
240 slp_oprnd_info oprnd_info
;
242 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
244 oprnd_info
->def_stmts
.release ();
245 oprnd_info
->ops
.release ();
246 XDELETE (oprnd_info
);
249 oprnds_info
.release ();
253 /* Return true if STMTS contains a pattern statement. */
256 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
258 stmt_vec_info stmt_info
;
260 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
261 if (is_pattern_stmt_p (stmt_info
))
266 /* Return true when all lanes in the external or constant NODE have
270 vect_slp_tree_uniform_p (slp_tree node
)
272 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
275 /* Pre-exsting vectors. */
276 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
280 tree op
, first
= NULL_TREE
;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
284 else if (!operand_equal_p (first
, op
, 0))
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
296 stmt_vec_info first_stmt_info
)
298 stmt_vec_info next_stmt_info
= first_stmt_info
;
301 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
306 if (next_stmt_info
== stmt_info
)
308 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
310 result
+= DR_GROUP_GAP (next_stmt_info
);
312 while (next_stmt_info
);
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
324 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
325 tree elt_type
, unsigned int *nvectors_out
,
326 tree
*vector_type_out
,
329 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
330 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
333 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
334 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
335 unsigned int nvectors
= 1;
338 scalar_int_mode int_mode
;
339 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
340 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type
= build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode
), 1);
346 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
348 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
350 GET_MODE_SIZE (base_vector_mode
)))
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
356 vec_perm_builder
sel1 (nelts
, 2, 3);
357 vec_perm_builder
sel2 (nelts
, 2, 3);
358 poly_int64 half_nelts
= exact_div (nelts
, 2);
359 for (unsigned int i
= 0; i
< 3; ++i
)
362 sel1
.quick_push (i
+ nelts
);
363 sel2
.quick_push (half_nelts
+ i
);
364 sel2
.quick_push (half_nelts
+ i
+ nelts
);
366 vec_perm_indices
indices1 (sel1
, 2, nelts
);
367 vec_perm_indices
indices2 (sel2
, 2, nelts
);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
372 *nvectors_out
= nvectors
;
374 *vector_type_out
= vector_type
;
377 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
379 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
386 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
392 /* Return true if DTA and DTB match. */
395 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
398 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
399 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
402 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
403 they are of a valid type and that they match the defs of the first stmt of
404 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
405 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
406 indicates swap is required for cond_expr stmts. Specifically, *SWAP
407 is 1 if STMT is cond and operands of comparison need to be swapped;
408 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
409 If there is any operand swap in this function, *SWAP is set to non-zero
411 If there was a fatal error return -1; if the error could be corrected by
412 swapping operands of father node of this one, return 1; if everything is
415 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
416 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
417 vec
<slp_oprnd_info
> *oprnds_info
)
419 stmt_vec_info stmt_info
= stmts
[stmt_num
];
421 unsigned int i
, number_of_oprnds
;
422 enum vect_def_type dt
= vect_uninitialized_def
;
423 slp_oprnd_info oprnd_info
;
424 int first_op_idx
= 1;
425 unsigned int commutative_op
= -1U;
426 bool first_op_cond
= false;
427 bool first
= stmt_num
== 0;
429 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
431 number_of_oprnds
= gimple_call_num_args (stmt
);
433 if (gimple_call_internal_p (stmt
))
435 internal_fn ifn
= gimple_call_internal_fn (stmt
);
436 commutative_op
= first_commutative_argument (ifn
);
438 /* Masked load, only look at mask. */
439 if (ifn
== IFN_MASK_LOAD
)
441 number_of_oprnds
= 1;
442 /* Mask operand index. */
447 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
449 enum tree_code code
= gimple_assign_rhs_code (stmt
);
450 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
451 /* Swap can only be done for cond_expr if asked to, otherwise we
452 could result in different comparison code to the first stmt. */
453 if (code
== COND_EXPR
454 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
456 first_op_cond
= true;
460 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
462 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
463 number_of_oprnds
= gimple_phi_num_args (stmt
);
467 bool swapped
= (swap
!= 0);
468 bool backedge
= false;
469 gcc_assert (!swapped
|| first_op_cond
);
470 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
471 for (i
= 0; i
< number_of_oprnds
; i
++)
475 /* Map indicating how operands of cond_expr should be swapped. */
476 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
477 int *map
= maps
[swap
];
480 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
481 first_op_idx
), map
[i
]);
483 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
485 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
487 oprnd
= gimple_phi_arg_def (stmt
, i
);
488 backedge
= dominated_by_p (CDI_DOMINATORS
,
489 gimple_phi_arg_edge (stmt
, i
)->src
,
490 gimple_bb (stmt_info
->stmt
));
493 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
494 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
495 oprnd
= TREE_OPERAND (oprnd
, 0);
497 oprnd_info
= (*oprnds_info
)[i
];
499 stmt_vec_info def_stmt_info
;
500 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
504 "Build SLP failed: can't analyze def for %T\n",
510 if (def_stmt_info
&& is_pattern_stmt_p (def_stmt_info
))
511 oprnd_info
->any_pattern
= true;
513 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
514 oprnd_info
->ops
.quick_push (oprnd
);
516 /* If there's a extern def on a backedge make sure we can
517 code-generate at the region start.
518 ??? This is another case that could be fixed by adjusting
519 how we split the function but at the moment we'd have conflicting
522 && dts
[i
] == vect_external_def
523 && is_a
<bb_vec_info
> (vinfo
)
524 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
525 && !dominated_by_p (CDI_DOMINATORS
,
526 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
527 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
529 if (dump_enabled_p ())
530 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
531 "Build SLP failed: extern def %T only defined "
532 "on backedge\n", oprnd
);
538 tree type
= TREE_TYPE (oprnd
);
540 if ((dt
== vect_constant_def
541 || dt
== vect_external_def
)
542 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
543 && (TREE_CODE (type
) == BOOLEAN_TYPE
544 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
549 "Build SLP failed: invalid type of def "
550 "for variable-length SLP %T\n", oprnd
);
554 /* For the swapping logic below force vect_reduction_def
555 for the reduction op in a SLP reduction group. */
556 if (!STMT_VINFO_DATA_REF (stmt_info
)
557 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
558 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
560 dts
[i
] = dt
= vect_reduction_def
;
562 /* Check the types of the definition. */
565 case vect_external_def
:
566 case vect_constant_def
:
567 case vect_internal_def
:
568 case vect_reduction_def
:
569 case vect_induction_def
:
570 case vect_nested_cycle
:
574 /* FORNOW: Not supported. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
577 "Build SLP failed: illegal type of def %T\n",
582 oprnd_info
->first_dt
= dt
;
583 oprnd_info
->first_op_type
= type
;
589 /* Now match the operand definition types to that of the first stmt. */
590 for (i
= 0; i
< number_of_oprnds
;)
592 oprnd_info
= (*oprnds_info
)[i
];
594 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
595 oprnd
= oprnd_info
->ops
[stmt_num
];
596 tree type
= TREE_TYPE (oprnd
);
598 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
600 if (dump_enabled_p ())
601 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
602 "Build SLP failed: different operand types\n");
606 /* Not first stmt of the group, check that the def-stmt/s match
607 the def-stmt/s of the first stmt. Allow different definition
608 types for reduction chains: the first stmt must be a
609 vect_reduction_def (a phi node), and the rest
610 end in the reduction chain. */
611 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
612 && !(oprnd_info
->first_dt
== vect_reduction_def
613 && !STMT_VINFO_DATA_REF (stmt_info
)
614 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
616 && !STMT_VINFO_DATA_REF (def_stmt_info
)
617 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
618 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
619 || (!STMT_VINFO_DATA_REF (stmt_info
)
620 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
622 || STMT_VINFO_DATA_REF (def_stmt_info
)
623 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
624 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
625 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
627 /* Try swapping operands if we got a mismatch. For BB
628 vectorization only in case it will clearly improve things. */
629 if (i
== commutative_op
&& !swapped
630 && (!is_a
<bb_vec_info
> (vinfo
)
631 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
633 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
634 || vect_def_types_match
635 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
637 if (dump_enabled_p ())
638 dump_printf_loc (MSG_NOTE
, vect_location
,
639 "trying swapped operands\n");
640 std::swap (dts
[i
], dts
[i
+1]);
641 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
642 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
643 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
644 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
649 if (is_a
<bb_vec_info
> (vinfo
)
650 && !oprnd_info
->any_pattern
)
652 /* Now for commutative ops we should see whether we can
653 make the other operand matching. */
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
656 "treating operand as external\n");
657 oprnd_info
->first_dt
= dt
= vect_external_def
;
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
663 "Build SLP failed: different types\n");
668 /* Make sure to demote the overall operand to external. */
669 if (dt
== vect_external_def
)
670 oprnd_info
->first_dt
= vect_external_def
;
671 /* For a SLP reduction chain we want to duplicate the reduction to
672 each of the chain members. That gets us a sane SLP graph (still
673 the stmts are not 100% correct wrt the initial values). */
674 else if ((dt
== vect_internal_def
675 || dt
== vect_reduction_def
)
676 && oprnd_info
->first_dt
== vect_reduction_def
677 && !STMT_VINFO_DATA_REF (stmt_info
)
678 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
679 && !STMT_VINFO_DATA_REF (def_stmt_info
)
680 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
681 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
683 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
684 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE
, vect_location
,
695 "swapped operands to match def types in %G",
702 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
703 Return true if we can, meaning that this choice doesn't conflict with
704 existing SLP nodes that use STMT_INFO. */
707 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
709 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
711 return useless_type_conversion_p (vectype
, old_vectype
);
713 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
715 /* We maintain the invariant that if any statement in the group is
716 used, all other members of the group have the same vector type. */
717 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
718 stmt_vec_info member_info
= first_info
;
719 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
720 if (is_pattern_stmt_p (member_info
)
721 && !useless_type_conversion_p (vectype
,
722 STMT_VINFO_VECTYPE (member_info
)))
727 for (member_info
= first_info
; member_info
;
728 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
729 STMT_VINFO_VECTYPE (member_info
) = vectype
;
733 else if (!is_pattern_stmt_p (stmt_info
))
735 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
739 if (dump_enabled_p ())
741 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
742 "Build SLP failed: incompatible vector"
743 " types for: %G", stmt_info
->stmt
);
744 dump_printf_loc (MSG_NOTE
, vect_location
,
745 " old vector type: %T\n", old_vectype
);
746 dump_printf_loc (MSG_NOTE
, vect_location
,
747 " new vector type: %T\n", vectype
);
752 /* Return true if call statements CALL1 and CALL2 are similar enough
753 to be combined into the same SLP group. */
756 compatible_calls_p (gcall
*call1
, gcall
*call2
)
758 unsigned int nargs
= gimple_call_num_args (call1
);
759 if (nargs
!= gimple_call_num_args (call2
))
762 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
765 if (gimple_call_internal_p (call1
))
767 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
768 TREE_TYPE (gimple_call_lhs (call2
))))
770 for (unsigned int i
= 0; i
< nargs
; ++i
)
771 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
772 TREE_TYPE (gimple_call_arg (call2
, i
))))
777 if (!operand_equal_p (gimple_call_fn (call1
),
778 gimple_call_fn (call2
), 0))
781 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
787 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
788 caller's attempt to find the vector type in STMT_INFO with the narrowest
789 element type. Return true if VECTYPE is nonnull and if it is valid
790 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
791 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
792 vect_build_slp_tree. */
795 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
796 unsigned int group_size
,
797 tree vectype
, poly_uint64
*max_nunits
)
801 if (dump_enabled_p ())
802 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
803 "Build SLP failed: unsupported data-type in %G\n",
805 /* Fatal mismatch. */
809 /* If populating the vector type requires unrolling then fail
810 before adjusting *max_nunits for basic-block vectorization. */
811 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
812 unsigned HOST_WIDE_INT const_nunits
;
813 if (is_a
<bb_vec_info
> (vinfo
)
814 && (!nunits
.is_constant (&const_nunits
)
815 || const_nunits
> group_size
))
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
819 "Build SLP failed: unrolling required "
820 "in basic block SLP\n");
821 /* Fatal mismatch. */
825 /* In case of multiple types we need to detect the smallest type. */
826 vect_update_max_nunits (max_nunits
, vectype
);
830 /* Verify if the scalar stmts STMTS are isomorphic, require data
831 permutation or are of unsupported types of operation. Return
832 true if they are, otherwise return false and indicate in *MATCHES
833 which stmts are not isomorphic to the first one. If MATCHES[0]
834 is false then this indicates the comparison could not be
835 carried out or the stmts will never be vectorized by SLP.
837 Note COND_EXPR is possibly isomorphic to another one after swapping its
838 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
839 the first stmt by swapping the two operands of comparison; set SWAP[i]
840 to 2 if stmt I is isormorphic to the first stmt by inverting the code
841 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
842 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
845 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
846 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
847 poly_uint64
*max_nunits
, bool *matches
,
848 bool *two_operators
, tree
*node_vectype
)
851 stmt_vec_info first_stmt_info
= stmts
[0];
852 enum tree_code first_stmt_code
= ERROR_MARK
;
853 enum tree_code alt_stmt_code
= ERROR_MARK
;
854 enum tree_code rhs_code
= ERROR_MARK
;
855 enum tree_code first_cond_code
= ERROR_MARK
;
857 bool need_same_oprnds
= false;
858 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
861 machine_mode optab_op2_mode
;
862 machine_mode vec_mode
;
863 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
864 bool first_stmt_load_p
= false, load_p
= false;
865 bool first_stmt_phi_p
= false, phi_p
= false;
867 /* For every stmt in NODE find its def stmt/s. */
868 stmt_vec_info stmt_info
;
869 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
871 gimple
*stmt
= stmt_info
->stmt
;
875 if (dump_enabled_p ())
876 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
878 /* Fail to vectorize statements marked as unvectorizable, throw
880 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
881 || stmt_can_throw_internal (cfun
, stmt
)
882 || gimple_has_volatile_ops (stmt
))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
886 "Build SLP failed: unvectorizable statement %G",
888 /* ??? For BB vectorization we want to commutate operands in a way
889 to shuffle all unvectorizable defs into one operand and have
890 the other still vectorized. The following doesn't reliably
891 work for this though but it's the easiest we can do here. */
892 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
894 /* Fatal mismatch. */
899 lhs
= gimple_get_lhs (stmt
);
900 if (lhs
== NULL_TREE
)
902 if (dump_enabled_p ())
903 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
904 "Build SLP failed: not GIMPLE_ASSIGN nor "
905 "GIMPLE_CALL %G", stmt
);
906 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
908 /* Fatal mismatch. */
914 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
915 &nunits_vectype
, group_size
)
917 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
918 nunits_vectype
, max_nunits
)))
920 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
922 /* Fatal mismatch. */
927 gcc_assert (vectype
);
929 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
932 rhs_code
= CALL_EXPR
;
934 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
936 else if ((gimple_call_internal_p (call_stmt
)
937 && (!vectorizable_internal_fn_p
938 (gimple_call_internal_fn (call_stmt
))))
939 || gimple_call_tail_p (call_stmt
)
940 || gimple_call_noreturn_p (call_stmt
)
941 || !gimple_call_nothrow_p (call_stmt
)
942 || gimple_call_chain (call_stmt
))
944 if (dump_enabled_p ())
945 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
946 "Build SLP failed: unsupported call type %G",
948 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
950 /* Fatal mismatch. */
955 else if (gimple_code (stmt
) == GIMPLE_PHI
)
957 rhs_code
= ERROR_MARK
;
962 rhs_code
= gimple_assign_rhs_code (stmt
);
963 load_p
= gimple_vuse (stmt
);
966 /* Check the operation. */
969 *node_vectype
= vectype
;
970 first_stmt_code
= rhs_code
;
971 first_stmt_load_p
= load_p
;
972 first_stmt_phi_p
= phi_p
;
974 /* Shift arguments should be equal in all the packed stmts for a
975 vector shift with scalar shift operand. */
976 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
977 || rhs_code
== LROTATE_EXPR
978 || rhs_code
== RROTATE_EXPR
)
980 vec_mode
= TYPE_MODE (vectype
);
982 /* First see if we have a vector/vector shift. */
983 optab
= optab_for_tree_code (rhs_code
, vectype
,
987 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
989 /* No vector/vector shift, try for a vector/scalar shift. */
990 optab
= optab_for_tree_code (rhs_code
, vectype
,
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
997 "Build SLP failed: no optab.\n");
998 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1000 /* Fatal mismatch. */
1004 icode
= (int) optab_handler (optab
, vec_mode
);
1005 if (icode
== CODE_FOR_nothing
)
1007 if (dump_enabled_p ())
1008 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1009 "Build SLP failed: "
1010 "op not supported by target.\n");
1011 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1013 /* Fatal mismatch. */
1017 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1018 if (!VECTOR_MODE_P (optab_op2_mode
))
1020 need_same_oprnds
= true;
1021 first_op1
= gimple_assign_rhs2 (stmt
);
1025 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1027 need_same_oprnds
= true;
1028 first_op1
= gimple_assign_rhs2 (stmt
);
1031 && rhs_code
== BIT_FIELD_REF
)
1033 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1034 if (TREE_CODE (vec
) != SSA_NAME
1035 || !types_compatible_p (vectype
, TREE_TYPE (vec
)))
1037 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1039 if (dump_enabled_p ())
1040 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1041 "Build SLP failed: "
1042 "BIT_FIELD_REF not supported\n");
1043 /* Fatal mismatch. */
1049 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1051 need_same_oprnds
= true;
1052 first_op1
= gimple_call_arg (call_stmt
, 1);
1057 if (first_stmt_code
!= rhs_code
1058 && alt_stmt_code
== ERROR_MARK
)
1059 alt_stmt_code
= rhs_code
;
1060 if ((first_stmt_code
!= rhs_code
1061 && (first_stmt_code
!= IMAGPART_EXPR
1062 || rhs_code
!= REALPART_EXPR
)
1063 && (first_stmt_code
!= REALPART_EXPR
1064 || rhs_code
!= IMAGPART_EXPR
)
1065 /* Handle mismatches in plus/minus by computing both
1066 and merging the results. */
1067 && !((first_stmt_code
== PLUS_EXPR
1068 || first_stmt_code
== MINUS_EXPR
)
1069 && (alt_stmt_code
== PLUS_EXPR
1070 || alt_stmt_code
== MINUS_EXPR
)
1071 && rhs_code
== alt_stmt_code
)
1072 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1073 && (first_stmt_code
== ARRAY_REF
1074 || first_stmt_code
== BIT_FIELD_REF
1075 || first_stmt_code
== INDIRECT_REF
1076 || first_stmt_code
== COMPONENT_REF
1077 || first_stmt_code
== MEM_REF
)))
1078 || first_stmt_load_p
!= load_p
1079 || first_stmt_phi_p
!= phi_p
)
1081 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1084 "Build SLP failed: different operation "
1085 "in stmt %G", stmt
);
1086 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1087 "original stmt %G", first_stmt_info
->stmt
);
1093 if (need_same_oprnds
)
1095 tree other_op1
= (call_stmt
1096 ? gimple_call_arg (call_stmt
, 1)
1097 : gimple_assign_rhs2 (stmt
));
1098 if (!operand_equal_p (first_op1
, other_op1
, 0))
1100 if (dump_enabled_p ())
1101 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1102 "Build SLP failed: different shift "
1103 "arguments in %G", stmt
);
1109 && first_stmt_code
== BIT_FIELD_REF
1110 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1111 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1113 if (dump_enabled_p ())
1114 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1115 "Build SLP failed: different BIT_FIELD_REF "
1116 "arguments in %G", stmt
);
1121 if (!load_p
&& rhs_code
== CALL_EXPR
)
1123 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1124 as_a
<gcall
*> (stmt
)))
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1128 "Build SLP failed: different calls in %G",
1136 && (gimple_bb (first_stmt_info
->stmt
)
1137 != gimple_bb (stmt_info
->stmt
)))
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1141 "Build SLP failed: different BB for PHI "
1147 if (!types_compatible_p (vectype
, *node_vectype
))
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1151 "Build SLP failed: different vector type "
1158 /* Grouped store or load. */
1159 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1161 if (REFERENCE_CLASS_P (lhs
))
1169 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1170 if (prev_first_load
)
1172 /* Check that there are no loads from different interleaving
1173 chains in the same node. */
1174 if (prev_first_load
!= first_load
)
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1179 "Build SLP failed: different "
1180 "interleaving chains in one node %G",
1187 prev_first_load
= first_load
;
1189 } /* Grouped access. */
1194 /* Not grouped load. */
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1197 "Build SLP failed: not grouped load %G", stmt
);
1199 /* FORNOW: Not grouped loads are not supported. */
1200 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1202 /* Fatal mismatch. */
1207 /* Not memory operation. */
1209 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1210 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1211 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1212 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1213 && rhs_code
!= VIEW_CONVERT_EXPR
1214 && rhs_code
!= CALL_EXPR
1215 && rhs_code
!= BIT_FIELD_REF
)
1217 if (dump_enabled_p ())
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1219 "Build SLP failed: operation unsupported %G",
1221 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1223 /* Fatal mismatch. */
1228 if (rhs_code
== COND_EXPR
)
1230 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1231 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1232 enum tree_code swap_code
= ERROR_MARK
;
1233 enum tree_code invert_code
= ERROR_MARK
;
1236 first_cond_code
= TREE_CODE (cond_expr
);
1237 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1239 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1240 swap_code
= swap_tree_comparison (cond_code
);
1241 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1244 if (first_cond_code
== cond_code
)
1246 /* Isomorphic can be achieved by swapping. */
1247 else if (first_cond_code
== swap_code
)
1249 /* Isomorphic can be achieved by inverting. */
1250 else if (first_cond_code
== invert_code
)
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1256 "Build SLP failed: different"
1257 " operation %G", stmt
);
1267 for (i
= 0; i
< group_size
; ++i
)
1271 /* If we allowed a two-operation SLP node verify the target can cope
1272 with the permute we are going to use. */
1273 if (alt_stmt_code
!= ERROR_MARK
1274 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1276 *two_operators
= true;
1282 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1283 Note we never remove apart from at destruction time so we do not
1284 need a special value for deleted that differs from empty. */
1287 typedef vec
<stmt_vec_info
> value_type
;
1288 typedef vec
<stmt_vec_info
> compare_type
;
1289 static inline hashval_t
hash (value_type
);
1290 static inline bool equal (value_type existing
, value_type candidate
);
1291 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1292 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1293 static const bool empty_zero_p
= true;
1294 static inline void mark_empty (value_type
&x
) { x
.release (); }
1295 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1296 static inline void remove (value_type
&x
) { x
.release (); }
1299 bst_traits::hash (value_type x
)
1302 for (unsigned i
= 0; i
< x
.length (); ++i
)
1303 h
.add_int (gimple_uid (x
[i
]->stmt
));
1307 bst_traits::equal (value_type existing
, value_type candidate
)
1309 if (existing
.length () != candidate
.length ())
1311 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1312 if (existing
[i
] != candidate
[i
])
1317 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1318 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1319 scalar_stmts_to_slp_tree_map_t
;
1322 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1323 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1324 poly_uint64
*max_nunits
,
1325 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1326 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1329 vect_build_slp_tree (vec_info
*vinfo
,
1330 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1331 poly_uint64
*max_nunits
,
1332 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1333 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1335 if (slp_tree
*leader
= bst_map
->get (stmts
))
1337 if (dump_enabled_p ())
1338 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1339 *leader
? "" : "failed ", *leader
);
1342 SLP_TREE_REF_COUNT (*leader
)++;
1343 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1348 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1349 so we can pick up backedge destinations during discovery. */
1350 slp_tree res
= new _slp_tree
;
1351 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1352 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1353 bst_map
->put (stmts
.copy (), res
);
1355 poly_uint64 this_max_nunits
= 1;
1356 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1358 matches
, npermutes
, tree_size
, bst_map
);
1361 bool existed_p
= bst_map
->put (stmts
, NULL
);
1362 gcc_assert (existed_p
);
1363 /* Mark the node invalid so we can detect those when still in use
1364 as backedge destinations. */
1365 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1366 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1367 vect_free_slp_tree (res
);
1371 gcc_assert (res_
== res
);
1372 res
->max_nunits
= this_max_nunits
;
1373 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1374 /* Keep a reference for the bst_map use. */
1375 SLP_TREE_REF_COUNT (res
)++;
1380 /* Recursively build an SLP tree starting from NODE.
1381 Fail (and return a value not equal to zero) if def-stmts are not
1382 isomorphic, require data permutation or are of unsupported types of
1383 operation. Otherwise, return 0.
1384 The value returned is the depth in the SLP tree where a mismatch
1388 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1389 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1390 poly_uint64
*max_nunits
,
1391 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1392 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1394 unsigned nops
, i
, this_tree_size
= 0;
1395 poly_uint64 this_max_nunits
= *max_nunits
;
1399 stmt_vec_info stmt_info
= stmts
[0];
1400 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1401 nops
= gimple_call_num_args (stmt
);
1402 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1404 nops
= gimple_num_ops (stmt
) - 1;
1405 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1408 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1409 nops
= gimple_phi_num_args (phi
);
1413 /* If the SLP node is a PHI (induction or reduction), terminate
1415 bool skip_args
[2] = { false, false };
1416 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1417 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1419 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1420 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1422 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1426 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1427 /* Induction from different IVs is not supported. */
1428 if (def_type
== vect_induction_def
)
1430 stmt_vec_info other_info
;
1431 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1432 if (stmt_info
!= other_info
)
1435 /* Induction PHIs are leafs. */
1437 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1438 SLP_TREE_VECTYPE (node
) = vectype
;
1439 SLP_TREE_CHILDREN (node
).quick_grow_cleared (nops
);
1442 else if (def_type
== vect_reduction_def
1443 || def_type
== vect_double_reduction_def
1444 || def_type
== vect_nested_cycle
)
1446 /* Else def types have to match. */
1447 stmt_vec_info other_info
;
1448 bool all_same
= true;
1449 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1451 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1453 if (other_info
!= stmt_info
)
1456 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1457 /* Reduction initial values are not explicitely represented. */
1458 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1459 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1460 /* Reduction chain backedge defs are filled manually.
1461 ??? Need a better way to identify a SLP reduction chain PHI.
1462 Or a better overall way to SLP match those. */
1463 if (all_same
&& def_type
== vect_reduction_def
)
1464 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1466 else if (def_type
!= vect_internal_def
)
1471 bool two_operators
= false;
1472 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1473 tree vectype
= NULL_TREE
;
1474 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1475 &this_max_nunits
, matches
, &two_operators
,
1479 /* If the SLP node is a load, terminate the recursion unless masked. */
1480 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1481 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1483 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1486 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1491 *max_nunits
= this_max_nunits
;
1493 node
= vect_create_new_slp_node (node
, stmts
, 0);
1494 SLP_TREE_VECTYPE (node
) = vectype
;
1495 /* And compute the load permutation. Whether it is actually
1496 a permutation depends on the unrolling factor which is
1498 vec
<unsigned> load_permutation
;
1500 stmt_vec_info load_info
;
1501 load_permutation
.create (group_size
);
1502 stmt_vec_info first_stmt_info
1503 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1504 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1506 int load_place
= vect_get_place_in_interleaving_chain
1507 (load_info
, first_stmt_info
);
1508 gcc_assert (load_place
!= -1);
1509 load_permutation
.safe_push (load_place
);
1511 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1515 else if (gimple_assign_single_p (stmt_info
->stmt
)
1516 && !gimple_vuse (stmt_info
->stmt
)
1517 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1519 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1520 the same SSA name vector of a compatible type to vectype. */
1521 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1522 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1523 stmt_vec_info estmt_info
;
1524 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1526 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1527 tree bfref
= gimple_assign_rhs1 (estmt
);
1529 if (!known_eq (bit_field_size (bfref
),
1530 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1531 || !constant_multiple_p (bit_field_offset (bfref
),
1532 bit_field_size (bfref
), &lane
))
1537 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1539 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1540 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1541 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1542 /* We are always building a permutation node even if it is an identity
1543 permute to shield the rest of the vectorizer from the odd node
1544 representing an actual vector without any scalar ops.
1545 ??? We could hide it completely with making the permute node
1547 node
= vect_create_new_slp_node (node
, stmts
, 1);
1548 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1549 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1550 SLP_TREE_VECTYPE (node
) = vectype
;
1551 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1555 /* Get at the operands, verifying they are compatible. */
1556 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1557 slp_oprnd_info oprnd_info
;
1558 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1560 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
],
1561 stmts
, i
, &oprnds_info
);
1563 matches
[(res
== -1) ? 0 : i
] = false;
1567 for (i
= 0; i
< group_size
; ++i
)
1570 vect_free_oprnd_info (oprnds_info
);
1575 auto_vec
<slp_tree
, 4> children
;
1577 stmt_info
= stmts
[0];
1579 /* Create SLP_TREE nodes for the definition node/s. */
1580 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1585 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1587 /* COND_EXPR have one too many eventually if the condition
1589 gcc_assert (i
== 3 && nops
== 4);
1593 /* We're skipping certain operands from processing, for example
1594 outer loop reduction initial defs. */
1595 if (i
<= 1 && skip_args
[i
])
1597 children
.safe_push (NULL
);
1601 if (is_a
<bb_vec_info
> (vinfo
)
1602 && oprnd_info
->first_dt
== vect_internal_def
)
1604 /* For BB vectorization, if all defs are the same do not
1605 bother to continue the build along the single-lane
1606 graph but use a splat of the scalar value. */
1607 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1608 for (j
= 1; j
< group_size
; ++j
)
1609 if (oprnd_info
->def_stmts
[j
] != first_def
)
1612 /* But avoid doing this for loads where we may be
1613 able to CSE things. */
1614 && !gimple_vuse (first_def
->stmt
))
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_NOTE
, vect_location
,
1618 "Using a splat of the uniform operand\n");
1619 oprnd_info
->first_dt
= vect_external_def
;
1623 if (oprnd_info
->first_dt
== vect_external_def
1624 || oprnd_info
->first_dt
== vect_constant_def
)
1626 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1627 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1628 oprnd_info
->ops
= vNULL
;
1629 children
.safe_push (invnode
);
1633 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1634 group_size
, &this_max_nunits
,
1636 &this_tree_size
, bst_map
)) != NULL
)
1638 oprnd_info
->def_stmts
= vNULL
;
1639 children
.safe_push (child
);
1643 /* If the SLP build for operand zero failed and operand zero
1644 and one can be commutated try that for the scalar stmts
1645 that failed the match. */
1647 /* A first scalar stmt mismatch signals a fatal mismatch. */
1649 /* ??? For COND_EXPRs we can swap the comparison operands
1650 as well as the arms under some constraints. */
1652 && oprnds_info
[1]->first_dt
== vect_internal_def
1653 && is_gimple_assign (stmt_info
->stmt
)
1654 /* Swapping operands for reductions breaks assumptions later on. */
1655 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1656 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
1657 /* Do so only if the number of not successful permutes was nor more
1658 than a cut-ff as re-trying the recursive match on
1659 possibly each level of the tree would expose exponential
1663 /* See whether we can swap the matching or the non-matching
1665 bool swap_not_matching
= true;
1668 for (j
= 0; j
< group_size
; ++j
)
1670 if (matches
[j
] != !swap_not_matching
)
1672 stmt_vec_info stmt_info
= stmts
[j
];
1673 /* Verify if we can swap operands of this stmt. */
1674 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1676 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1678 if (!swap_not_matching
)
1680 swap_not_matching
= false;
1685 while (j
!= group_size
);
1687 /* Swap mismatched definition stmts. */
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_NOTE
, vect_location
,
1690 "Re-trying with swapped operands of stmts ");
1691 for (j
= 0; j
< group_size
; ++j
)
1692 if (matches
[j
] == !swap_not_matching
)
1694 std::swap (oprnds_info
[0]->def_stmts
[j
],
1695 oprnds_info
[1]->def_stmts
[j
]);
1696 std::swap (oprnds_info
[0]->ops
[j
],
1697 oprnds_info
[1]->ops
[j
]);
1698 if (dump_enabled_p ())
1699 dump_printf (MSG_NOTE
, "%d ", j
);
1701 if (dump_enabled_p ())
1702 dump_printf (MSG_NOTE
, "\n");
1703 /* And try again with scratch 'matches' ... */
1704 bool *tem
= XALLOCAVEC (bool, group_size
);
1705 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1706 group_size
, &this_max_nunits
,
1708 &this_tree_size
, bst_map
)) != NULL
)
1710 oprnd_info
->def_stmts
= vNULL
;
1711 children
.safe_push (child
);
1714 /* We do not undo the swapping here since it might still be
1715 the better order for the second operand in case we build
1716 the first one from scalars below. */
1721 /* If the SLP build failed and we analyze a basic-block
1722 simply treat nodes we fail to build as externally defined
1723 (and thus build vectors from the scalar defs).
1724 The cost model will reject outright expensive cases.
1725 ??? This doesn't treat cases where permutation ultimatively
1726 fails (or we don't try permutation below). Ideally we'd
1727 even compute a permutation that will end up with the maximum
1729 if (is_a
<bb_vec_info
> (vinfo
)
1730 /* ??? Rejecting patterns this way doesn't work. We'd have to
1731 do extra work to cancel the pattern so the uses see the
1733 && !is_pattern_stmt_p (stmt_info
)
1734 && !oprnd_info
->any_pattern
)
1736 /* But if there's a leading vector sized set of matching stmts
1737 fail here so we can split the group. This matches the condition
1738 vect_analyze_slp_instance uses. */
1739 /* ??? We might want to split here and combine the results to support
1740 multiple vector sizes better. */
1741 for (j
= 0; j
< group_size
; ++j
)
1744 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE
, vect_location
,
1748 "Building vector operands from scalars\n");
1750 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1751 children
.safe_push (child
);
1752 oprnd_info
->ops
= vNULL
;
1757 gcc_assert (child
== NULL
);
1758 FOR_EACH_VEC_ELT (children
, j
, child
)
1760 vect_free_slp_tree (child
);
1761 vect_free_oprnd_info (oprnds_info
);
1765 vect_free_oprnd_info (oprnds_info
);
1767 /* If we have all children of a child built up from uniform scalars
1768 or does more than one possibly expensive vector construction then
1769 just throw that away, causing it built up from scalars.
1770 The exception is the SLP node for the vector store. */
1771 if (is_a
<bb_vec_info
> (vinfo
)
1772 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1773 /* ??? Rejecting patterns this way doesn't work. We'd have to
1774 do extra work to cancel the pattern so the uses see the
1776 && !is_pattern_stmt_p (stmt_info
))
1780 bool all_uniform_p
= true;
1781 unsigned n_vector_builds
= 0;
1782 FOR_EACH_VEC_ELT (children
, j
, child
)
1786 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1787 all_uniform_p
= false;
1788 else if (!vect_slp_tree_uniform_p (child
))
1790 all_uniform_p
= false;
1791 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1795 if (all_uniform_p
|| n_vector_builds
> 1)
1799 FOR_EACH_VEC_ELT (children
, j
, child
)
1801 vect_free_slp_tree (child
);
1803 if (dump_enabled_p ())
1804 dump_printf_loc (MSG_NOTE
, vect_location
,
1805 "Building parent vector operands from "
1806 "scalars instead\n");
1811 *tree_size
+= this_tree_size
+ 1;
1812 *max_nunits
= this_max_nunits
;
1816 /* ??? We'd likely want to either cache in bst_map sth like
1817 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1818 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1819 explicit stmts to put in so the keying on 'stmts' doesn't
1820 work (but we have the same issue with nodes that use 'ops'). */
1821 slp_tree one
= new _slp_tree
;
1822 slp_tree two
= new _slp_tree
;
1823 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1824 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1825 SLP_TREE_VECTYPE (one
) = vectype
;
1826 SLP_TREE_VECTYPE (two
) = vectype
;
1827 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1828 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1830 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1831 SLP_TREE_REF_COUNT (child
)++;
1833 /* Here we record the original defs since this
1834 node represents the final lane configuration. */
1835 node
= vect_create_new_slp_node (node
, stmts
, 2);
1836 SLP_TREE_VECTYPE (node
) = vectype
;
1837 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1838 SLP_TREE_CHILDREN (node
).quick_push (one
);
1839 SLP_TREE_CHILDREN (node
).quick_push (two
);
1840 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1841 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1842 enum tree_code ocode
= ERROR_MARK
;
1843 stmt_vec_info ostmt_info
;
1845 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1847 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1848 if (gimple_assign_rhs_code (ostmt
) != code0
)
1850 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1851 ocode
= gimple_assign_rhs_code (ostmt
);
1855 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1857 SLP_TREE_CODE (one
) = code0
;
1858 SLP_TREE_CODE (two
) = ocode
;
1859 SLP_TREE_LANES (one
) = stmts
.length ();
1860 SLP_TREE_LANES (two
) = stmts
.length ();
1861 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1862 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1866 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1867 SLP_TREE_VECTYPE (node
) = vectype
;
1868 SLP_TREE_CHILDREN (node
).splice (children
);
1872 /* Dump a single SLP tree NODE. */
1875 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1880 stmt_vec_info stmt_info
;
1883 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1884 dump_user_location_t user_loc
= loc
.get_user_location ();
1885 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1886 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1888 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1891 estimated_poly_value (node
->max_nunits
),
1892 SLP_TREE_REF_COUNT (node
));
1893 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1894 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1895 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1898 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1899 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1900 dump_printf (metadata
, "%T%s ", op
,
1901 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1902 dump_printf (metadata
, "}\n");
1904 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1906 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1907 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1908 dump_printf (dump_kind
, " %u", j
);
1909 dump_printf (dump_kind
, " }\n");
1911 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1913 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1914 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1915 dump_printf (dump_kind
, " %u[%u]",
1916 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1917 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
1918 dump_printf (dump_kind
, " }\n");
1920 if (SLP_TREE_CHILDREN (node
).is_empty ())
1922 dump_printf_loc (metadata
, user_loc
, "\tchildren");
1923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1924 dump_printf (dump_kind
, " %p", (void *)child
);
1925 dump_printf (dump_kind
, "\n");
1929 debug (slp_tree node
)
1931 debug_dump_context ctx
;
1932 vect_print_slp_tree (MSG_NOTE
,
1933 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
1937 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1940 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1941 slp_tree node
, hash_set
<slp_tree
> &visited
)
1946 if (visited
.add (node
))
1949 vect_print_slp_tree (dump_kind
, loc
, node
);
1951 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1953 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
1957 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1960 hash_set
<slp_tree
> visited
;
1961 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
1964 /* Mark the tree rooted at NODE with PURE_SLP. */
1967 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
1970 stmt_vec_info stmt_info
;
1973 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1976 if (visited
.add (node
))
1979 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1980 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
1982 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1984 vect_mark_slp_stmts (child
, visited
);
1988 vect_mark_slp_stmts (slp_tree node
)
1990 hash_set
<slp_tree
> visited
;
1991 vect_mark_slp_stmts (node
, visited
);
1994 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1997 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2000 stmt_vec_info stmt_info
;
2003 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2006 if (visited
.add (node
))
2009 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2011 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2012 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2013 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2016 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2018 vect_mark_slp_stmts_relevant (child
, visited
);
2022 vect_mark_slp_stmts_relevant (slp_tree node
)
2024 hash_set
<slp_tree
> visited
;
2025 vect_mark_slp_stmts_relevant (node
, visited
);
2029 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2032 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2033 hash_set
<slp_tree
> &visited
)
2035 if (!node
|| visited
.add (node
))
2038 if (SLP_TREE_CHILDREN (node
).length () == 0)
2040 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2042 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2043 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2044 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2045 loads
.safe_push (node
);
2051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2052 vect_gather_slp_loads (loads
, child
, visited
);
2057 vect_gather_slp_loads (slp_instance inst
, slp_tree node
)
2059 hash_set
<slp_tree
> visited
;
2060 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst
), node
, visited
);
2064 /* Find the last store in SLP INSTANCE. */
2067 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2069 stmt_vec_info last
= NULL
;
2070 stmt_vec_info stmt_vinfo
;
2072 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2074 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2075 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2081 /* Find the first stmt in NODE. */
2084 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2086 stmt_vec_info first
= NULL
;
2087 stmt_vec_info stmt_vinfo
;
2089 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2091 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2093 || get_later_stmt (stmt_vinfo
, first
) == first
)
2100 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2101 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2102 (also containing the first GROUP1_SIZE stmts, since stores are
2103 consecutive), the second containing the remainder.
2104 Return the first stmt in the second group. */
2106 static stmt_vec_info
2107 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2109 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2110 gcc_assert (group1_size
> 0);
2111 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2112 gcc_assert (group2_size
> 0);
2113 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2115 stmt_vec_info stmt_info
= first_vinfo
;
2116 for (unsigned i
= group1_size
; i
> 1; i
--)
2118 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2119 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2121 /* STMT is now the last element of the first group. */
2122 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2123 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2125 DR_GROUP_SIZE (group2
) = group2_size
;
2126 for (stmt_info
= group2
; stmt_info
;
2127 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2129 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2130 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2133 /* For the second group, the DR_GROUP_GAP is that before the original group,
2134 plus skipping over the first vector. */
2135 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2137 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2138 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2142 group1_size
, group2_size
);
2147 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2148 statements and a vector of NUNITS elements. */
2151 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2153 return exact_div (common_multiple (nunits
, group_size
), group_size
);
/* Kinds of SLP instance roots.  NOTE(review): this enum appears
   truncated by extraction -- any trailing enumerators and the closing
   brace are missing here; confirm against the original file.  */
2156 enum slp_instance_kind
{
2157 slp_inst_kind_store
,
2158 slp_inst_kind_reduc_group
,
2159 slp_inst_kind_reduc_chain
,
2164 vect_analyze_slp_instance (vec_info
*vinfo
,
2165 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2166 stmt_vec_info stmt_info
, unsigned max_tree_size
);
2168 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2169 of KIND. Return true if successful. */
2172 vect_build_slp_instance (vec_info
*vinfo
,
2173 slp_instance_kind kind
,
2174 vec
<stmt_vec_info
> scalar_stmts
,
2175 stmt_vec_info root_stmt_info
,
2176 unsigned max_tree_size
,
2177 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2178 /* ??? We need stmt_info for group splitting. */
2179 stmt_vec_info stmt_info_
)
2181 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE
, vect_location
,
2184 "Starting SLP discovery for\n");
2185 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2186 dump_printf_loc (MSG_NOTE
, vect_location
,
2187 " %G", scalar_stmts
[i
]->stmt
);
2190 /* Build the tree for the SLP instance. */
2191 unsigned int group_size
= scalar_stmts
.length ();
2192 bool *matches
= XALLOCAVEC (bool, group_size
);
2193 unsigned npermutes
= 0;
2194 poly_uint64 max_nunits
= 1;
2195 unsigned tree_size
= 0;
2197 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2198 &max_nunits
, matches
, &npermutes
,
2199 &tree_size
, bst_map
);
2202 /* Calculate the unrolling factor based on the smallest type. */
2203 poly_uint64 unrolling_factor
2204 = calculate_unrolling_factor (max_nunits
, group_size
);
2206 if (maybe_ne (unrolling_factor
, 1U)
2207 && is_a
<bb_vec_info
> (vinfo
))
2209 unsigned HOST_WIDE_INT const_max_nunits
;
2210 if (!max_nunits
.is_constant (&const_max_nunits
)
2211 || const_max_nunits
> group_size
)
2213 if (dump_enabled_p ())
2214 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2215 "Build SLP failed: store group "
2216 "size not a multiple of the vector size "
2217 "in basic block SLP\n");
2218 vect_free_slp_tree (node
);
2221 /* Fatal mismatch. */
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE
, vect_location
,
2224 "SLP discovery succeeded but node needs "
2226 memset (matches
, true, group_size
);
2227 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2228 vect_free_slp_tree (node
);
2232 /* Create a new SLP instance. */
2233 slp_instance new_instance
= XNEW (class _slp_instance
);
2234 SLP_INSTANCE_TREE (new_instance
) = node
;
2235 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2236 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2237 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2238 new_instance
->reduc_phis
= NULL
;
2239 new_instance
->cost_vec
= vNULL
;
2240 new_instance
->subgraph_entries
= vNULL
;
2242 vect_gather_slp_loads (new_instance
, node
);
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_NOTE
, vect_location
,
2245 "SLP size %u vs. limit %u.\n",
2246 tree_size
, max_tree_size
);
2248 /* Check whether any load is possibly permuted. */
2250 bool loads_permuted
= false;
2251 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2253 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2256 stmt_vec_info load_info
;
2257 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2258 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2260 loads_permuted
= true;
2265 /* If the loads and stores can be handled with load/store-lane
2266 instructions do not generate this SLP instance. */
2267 if (is_a
<loop_vec_info
> (vinfo
)
2269 && kind
== slp_inst_kind_store
2270 && vect_store_lanes_supported
2271 (STMT_VINFO_VECTYPE (scalar_stmts
[0]), group_size
, false))
2274 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2276 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2277 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2278 /* Use SLP for strided accesses (or if we can't load-lanes). */
2279 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2280 || ! vect_load_lanes_supported
2281 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2282 DR_GROUP_SIZE (stmt_vinfo
), false))
2285 if (i
== SLP_INSTANCE_LOADS (new_instance
).length ())
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2289 "Built SLP cancelled: can use "
2290 "load/store-lanes\n");
2291 vect_free_slp_instance (new_instance
);
2296 /* Fixup SLP reduction chains. */
2297 if (kind
== slp_inst_kind_reduc_chain
)
2299 /* If this is a reduction chain with a conversion in front
2300 amend the SLP tree with a node for that. */
2302 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2303 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2305 /* Get at the conversion stmt - we know it's the single use
2306 of the last stmt of the reduction chain. */
2307 use_operand_p use_p
;
2308 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2309 &use_p
, &scalar_def
);
2311 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2312 next_info
= vect_stmt_to_vectorize (next_info
);
2313 scalar_stmts
= vNULL
;
2314 scalar_stmts
.create (group_size
);
2315 for (unsigned i
= 0; i
< group_size
; ++i
)
2316 scalar_stmts
.quick_push (next_info
);
2317 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2318 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2319 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2320 SLP_INSTANCE_TREE (new_instance
) = conv
;
2321 /* We also have to fake this conversion stmt as SLP reduction
2322 group so we don't have to mess with too much code
2324 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2325 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2327 /* Fill the backedge child of the PHI SLP node. The
2328 general matching code cannot find it because the
2329 scalar code does not reflect how we vectorize the
2331 use_operand_p use_p
;
2332 imm_use_iterator imm_iter
;
2333 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2334 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2335 gimple_get_lhs (scalar_def
))
2336 /* There are exactly two non-debug uses, the reduction
2337 PHI and the loop-closed PHI node. */
2338 if (!is_gimple_debug (USE_STMT (use_p
))
2339 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2341 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2342 stmt_vec_info phi_info
2343 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2344 for (unsigned i
= 0; i
< group_size
; ++i
)
2345 phis
.quick_push (phi_info
);
2346 slp_tree
*phi_node
= bst_map
->get (phis
);
2347 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2348 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2349 = SLP_INSTANCE_TREE (new_instance
);
2350 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2354 vinfo
->slp_instances
.safe_push (new_instance
);
2356 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2357 the number of scalar stmts in the root in a few places.
2358 Verify that assumption holds. */
2359 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2360 .length () == group_size
);
2362 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE
, vect_location
,
2365 "Final SLP tree for instance:\n");
2366 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2367 SLP_INSTANCE_TREE (new_instance
));
2375 /* Failed to SLP. */
2376 /* Free the allocated memory. */
2377 scalar_stmts
.release ();
2380 stmt_vec_info stmt_info
= stmt_info_
;
2381 /* Try to break the group up into pieces. */
2382 if (kind
== slp_inst_kind_store
)
2384 /* ??? We could delay all the actual splitting of store-groups
2385 until after SLP discovery of the original group completed.
2386 Then we can recurse to vect_build_slp_instance directly. */
2387 for (i
= 0; i
< group_size
; i
++)
2391 /* For basic block SLP, try to break the group up into multiples of
2393 if (is_a
<bb_vec_info
> (vinfo
)
2394 && (i
> 1 && i
< group_size
))
2397 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2398 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2399 1 << floor_log2 (i
));
2400 unsigned HOST_WIDE_INT const_nunits
;
2402 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2404 /* Split into two groups at the first vector boundary. */
2405 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2406 unsigned group1_size
= i
& ~(const_nunits
- 1);
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_NOTE
, vect_location
,
2410 "Splitting SLP group at stmt %u\n", i
);
2411 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2413 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2415 /* Split the rest at the failure point and possibly
2416 re-analyze the remaining matching part if it has
2417 at least two lanes. */
2419 && (i
+ 1 < group_size
2420 || i
- group1_size
> 1))
2422 stmt_vec_info rest2
= rest
;
2423 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2424 if (i
- group1_size
> 1)
2425 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2426 rest2
, max_tree_size
);
2428 /* Re-analyze the non-matching tail if it has at least
2430 if (i
+ 1 < group_size
)
2431 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2432 rest
, max_tree_size
);
2437 /* For loop vectorization split into arbitrary pieces of size > 1. */
2438 if (is_a
<loop_vec_info
> (vinfo
)
2439 && (i
> 1 && i
< group_size
))
2441 unsigned group1_size
= i
;
2443 if (dump_enabled_p ())
2444 dump_printf_loc (MSG_NOTE
, vect_location
,
2445 "Splitting SLP group at stmt %u\n", i
);
2447 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2449 /* Loop vectorization cannot handle gaps in stores, make sure
2450 the split group appears as strided. */
2451 STMT_VINFO_STRIDED_P (rest
) = 1;
2452 DR_GROUP_GAP (rest
) = 0;
2453 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2454 DR_GROUP_GAP (stmt_info
) = 0;
2456 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2458 if (i
+ 1 < group_size
)
2459 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2460 rest
, max_tree_size
);
2465 /* Even though the first vector did not all match, we might be able to SLP
2466 (some) of the remainder. FORNOW ignore this possibility. */
2469 /* Failed to SLP. */
2470 if (dump_enabled_p ())
2471 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2476 /* Analyze an SLP instance starting from a group of grouped stores. Call
2477 vect_build_slp_tree to build a tree of packed stmts if possible.
2478 Return FALSE if it's impossible to SLP any stmt in the loop. */
2481 vect_analyze_slp_instance (vec_info
*vinfo
,
2482 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2483 stmt_vec_info stmt_info
, unsigned max_tree_size
)
2485 unsigned int group_size
;
2487 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2488 vec
<stmt_vec_info
> scalar_stmts
;
2489 slp_instance_kind kind
;
2491 if (is_a
<bb_vec_info
> (vinfo
))
2492 vect_location
= stmt_info
->stmt
;
2493 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2495 kind
= slp_inst_kind_store
;
2496 group_size
= DR_GROUP_SIZE (stmt_info
);
2498 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2500 kind
= slp_inst_kind_reduc_chain
;
2501 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2502 group_size
= REDUC_GROUP_SIZE (stmt_info
);
2504 else if (is_gimple_assign (stmt_info
->stmt
)
2505 && gimple_assign_rhs_code (stmt_info
->stmt
) == CONSTRUCTOR
)
2507 kind
= slp_inst_kind_ctor
;
2508 group_size
= CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info
->stmt
));
2512 kind
= slp_inst_kind_reduc_group
;
2513 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2514 group_size
= as_a
<loop_vec_info
> (vinfo
)->reductions
.length ();
2517 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2518 scalar_stmts
.create (group_size
);
2519 stmt_vec_info next_info
= stmt_info
;
2520 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2522 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2525 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2526 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2529 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2531 /* Collect the reduction stmts and store them in
2532 SLP_TREE_SCALAR_STMTS. */
2535 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2536 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2538 /* Mark the first element of the reduction chain as reduction to properly
2539 transform the node. In the reduction analysis phase only the last
2540 element of the chain is marked as reduction. */
2541 STMT_VINFO_DEF_TYPE (stmt_info
)
2542 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2543 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2544 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2546 else if (kind
== slp_inst_kind_ctor
)
2548 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2550 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2552 if (TREE_CODE (val
) == SSA_NAME
)
2554 gimple
* def
= SSA_NAME_DEF_STMT (val
);
2555 stmt_vec_info def_info
= vinfo
->lookup_stmt (def
);
2556 /* Value is defined in another basic block. */
2559 def_info
= vect_stmt_to_vectorize (def_info
);
2560 scalar_stmts
.safe_push (def_info
);
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_NOTE
, vect_location
,
2567 "Analyzing vectorizable constructor: %G\n",
2572 /* Collect reduction statements. */
2573 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2574 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2575 scalar_stmts
.safe_push (next_info
);
2578 /* Build the tree for the SLP instance. */
2579 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2580 kind
== slp_inst_kind_ctor
2583 bst_map
, stmt_info
);
2585 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2586 where we should do store group splitting. */
2591 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2592 trees of packed scalar stmts if SLP is possible. */
2595 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2598 stmt_vec_info first_element
;
2600 DUMP_VECT_SCOPE ("vect_analyze_slp");
2602 scalar_stmts_to_slp_tree_map_t
*bst_map
2603 = new scalar_stmts_to_slp_tree_map_t ();
2605 /* Find SLP sequences starting from groups of grouped stores. */
2606 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2607 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
, max_tree_size
);
2609 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2611 if (loop_vinfo
->reduction_chains
.length () > 0)
2613 /* Find SLP sequences starting from reduction chains. */
2614 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2615 if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2618 /* Dissolve reduction chain group. */
2619 stmt_vec_info vinfo
= first_element
;
2620 stmt_vec_info last
= NULL
;
2623 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2624 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2625 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2629 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2630 /* It can be still vectorized as part of an SLP reduction. */
2631 loop_vinfo
->reductions
.safe_push (last
);
2635 /* Find SLP sequences starting from groups of reductions. */
2636 if (loop_vinfo
->reductions
.length () > 1)
2637 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2641 /* The map keeps a reference on SLP nodes built, release that. */
2642 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2643 it
!= bst_map
->end (); ++it
)
2645 vect_free_slp_tree ((*it
).second
);
2648 return opt_result::success ();
2651 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2654 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2655 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2660 if (visited
.add (node
))
2663 node
->vertex
= vertices
.length ();
2664 vertices
.safe_push (node
);
2667 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2671 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2674 leafs
.safe_push (node
->vertex
);
2677 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2680 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2683 hash_set
<slp_tree
> visited
;
2685 slp_instance instance
;
2686 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2687 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2691 /* Apply (reverse) bijectite PERM to VEC. */
2695 vect_slp_permute (vec
<unsigned> perm
,
2696 vec
<T
> &vec
, bool reverse
)
2698 auto_vec
<T
, 64> saved
;
2699 saved
.create (vec
.length ());
2700 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2701 saved
.quick_push (vec
[i
]);
2705 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2706 vec
[perm
[i
]] = saved
[i
];
2707 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2708 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
2712 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2713 vec
[i
] = saved
[perm
[i
]];
2714 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2715 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
2719 /* Return whether permutations PERM_A and PERM_B as recorded in the
2720 PERMS vector are equal. */
2723 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
2724 int perm_a
, int perm_b
)
2726 return (perm_a
== perm_b
2727 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
2728 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
2729 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
2732 /* Optimize the SLP graph of VINFO. */
2735 vect_optimize_slp (vec_info
*vinfo
)
2737 if (vinfo
->slp_instances
.is_empty ())
2742 auto_vec
<slp_tree
> vertices
;
2743 auto_vec
<int> leafs
;
2744 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
2746 struct graph
*slpg
= new_graph (vertices
.length ());
2747 FOR_EACH_VEC_ELT (vertices
, i
, node
)
2751 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2753 add_edge (slpg
, i
, child
->vertex
);
2756 /* Compute (reverse) postorder on the inverted graph. */
2758 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
2760 auto_sbitmap
n_visited (vertices
.length ());
2761 auto_sbitmap
n_materialize (vertices
.length ());
2762 auto_vec
<int> n_perm (vertices
.length ());
2763 auto_vec
<vec
<unsigned> > perms
;
2765 bitmap_clear (n_visited
);
2766 bitmap_clear (n_materialize
);
2767 n_perm
.quick_grow_cleared (vertices
.length ());
2768 perms
.safe_push (vNULL
); /* zero is no permute */
2770 /* Produce initial permutations. */
2771 for (i
= 0; i
< leafs
.length (); ++i
)
2774 slp_tree node
= vertices
[idx
];
2776 /* Handle externals and constants optimistically throughout the
2778 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
2779 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
2782 /* Loads are the only thing generating permutes and leafs do not
2783 change across iterations. */
2784 bitmap_set_bit (n_visited
, idx
);
2785 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2788 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2789 node unpermuted, record this permute. */
2790 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
2791 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
2793 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
2794 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
2795 bool any_permute
= false;
2796 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2798 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
2799 imin
= MIN (imin
, idx
);
2800 imax
= MAX (imax
, idx
);
2801 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
2804 /* If there's no permute no need to split one out. */
2807 /* If the span doesn't match we'd disrupt VF computation, avoid
2809 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
2812 /* For now only handle true permutes, like
2813 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2814 when permuting constants and invariants keeping the permute
2816 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
2817 bitmap_clear (load_index
);
2818 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2819 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
2821 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2822 if (!bitmap_bit_p (load_index
, j
))
2824 if (j
!= SLP_TREE_LANES (node
))
2827 vec
<unsigned> perm
= vNULL
;
2828 perm
.safe_grow (SLP_TREE_LANES (node
), true);
2829 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2830 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
2831 perms
.safe_push (perm
);
2832 n_perm
[idx
] = perms
.length () - 1;
2835 /* Propagate permutes along the graph and compute materialization points. */
2837 unsigned iteration
= 0;
2843 for (i
= vertices
.length (); i
> 0 ; --i
)
2846 slp_tree node
= vertices
[idx
];
2847 /* For leafs there's nothing to do - we've seeded permutes
2849 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2852 bitmap_set_bit (n_visited
, idx
);
2854 /* We cannot move a permute across a store. */
2855 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
2857 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
2861 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
2862 succ
; succ
= succ
->succ_next
)
2864 int succ_idx
= succ
->dest
;
2865 /* Handle unvisited nodes optimistically. */
2866 /* ??? But for constants once we want to handle non-bijective
2867 permutes we have to verify the permute, when unifying lanes,
2868 will not unify different constants. For example see
2869 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2870 if (!bitmap_bit_p (n_visited
, succ_idx
))
2872 int succ_perm
= n_perm
[succ_idx
];
2873 /* Once we materialize succs permutation its output lanes
2874 appear unpermuted to us. */
2875 if (bitmap_bit_p (n_materialize
, succ_idx
))
2879 else if (succ_perm
== 0)
2884 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
2892 /* Pick up pre-computed leaf values. */
2894 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
2897 /* Make sure we eventually converge. */
2898 gcc_checking_assert (perm
== 0);
2901 bitmap_clear_bit (n_materialize
, idx
);
2908 /* Elide pruning at materialization points in the first
2909 iteration so every node was visited once at least. */
2913 /* Decide on permute materialization. Look whether there's
2914 a use (pred) edge that is permuted differently than us.
2915 In that case mark ourselves so the permutation is applied. */
2916 bool all_preds_permuted
= slpg
->vertices
[idx
].pred
!= NULL
;
2917 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
2918 pred
; pred
= pred
->pred_next
)
2920 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
2921 int pred_perm
= n_perm
[pred
->src
];
2922 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
2924 all_preds_permuted
= false;
2928 if (!all_preds_permuted
)
2930 if (!bitmap_bit_p (n_materialize
, idx
))
2932 bitmap_set_bit (n_materialize
, idx
);
2936 while (changed
|| iteration
== 1);
2939 for (i
= 0; i
< vertices
.length (); ++i
)
2941 int perm
= n_perm
[i
];
2945 slp_tree node
= vertices
[i
];
2947 /* First permute invariant/external original successors. */
2950 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2952 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2955 /* If the vector is uniform there's nothing to do. */
2956 if (vect_slp_tree_uniform_p (child
))
2959 /* We can end up sharing some externals via two_operator
2960 handling. Be prepared to unshare those. */
2961 if (child
->refcnt
!= 1)
2963 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
2964 SLP_TREE_CHILDREN (node
)[j
] = child
2965 = vect_create_new_slp_node
2966 (SLP_TREE_SCALAR_OPS (child
).copy ());
2968 vect_slp_permute (perms
[perm
],
2969 SLP_TREE_SCALAR_OPS (child
), true);
2972 if (bitmap_bit_p (n_materialize
, i
))
2974 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2975 /* For loads simply drop the permutation, the load permutation
2976 already performs the desired permutation. */
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE
, vect_location
,
2982 "inserting permute node in place of %p\n",
2985 /* Make a copy of NODE and in-place change it to a
2986 VEC_PERM node to permute the lanes of the copy. */
2987 slp_tree copy
= new _slp_tree
;
2988 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
2989 SLP_TREE_CHILDREN (node
) = vNULL
;
2990 SLP_TREE_SCALAR_STMTS (copy
)
2991 = SLP_TREE_SCALAR_STMTS (node
).copy ();
2992 vect_slp_permute (perms
[perm
],
2993 SLP_TREE_SCALAR_STMTS (copy
), true);
2994 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
2995 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
2996 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
2997 SLP_TREE_LANE_PERMUTATION (copy
)
2998 = SLP_TREE_LANE_PERMUTATION (node
);
2999 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3000 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3002 copy
->max_nunits
= node
->max_nunits
;
3003 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3004 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3005 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3007 /* Now turn NODE into a VEC_PERM. */
3008 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3009 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3010 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3011 SLP_TREE_LANE_PERMUTATION (node
)
3012 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3013 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3018 /* Apply the reverse permutation to our stmts. */
3019 vect_slp_permute (perms
[perm
],
3020 SLP_TREE_SCALAR_STMTS (node
), true);
3021 /* And to the load permutation, which we can simply
3022 make regular by design. */
3023 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3025 /* ??? When we handle non-bijective permutes the idea
3026 is that we can force the load-permutation to be
3027 { min, min + 1, min + 2, ... max }. But then the
3028 scalar defs might no longer match the lane content
3029 which means wrong-code with live lane vectorization.
3030 So we possibly have to have NULL entries for those. */
3031 vect_slp_permute (perms
[perm
],
3032 SLP_TREE_LOAD_PERMUTATION (node
), true);
3037 /* Free the perms vector used for propagation. */
3038 while (!perms
.is_empty ())
3039 perms
.pop ().release ();
3043 /* Now elide load permutations that are not necessary. */
3044 for (i
= 0; i
< leafs
.length (); ++i
)
3047 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3050 /* In basic block vectorization we allow any subchain of an interleaving
3052 FORNOW: not in loop SLP because of realignment complications. */
3053 if (is_a
<bb_vec_info
> (vinfo
))
3055 bool subchain_p
= true;
3056 stmt_vec_info next_load_info
= NULL
;
3057 stmt_vec_info load_info
;
3059 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3062 && (next_load_info
!= load_info
3063 || DR_GROUP_GAP (load_info
) != 1))
3068 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3072 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3078 stmt_vec_info load_info
;
3079 bool this_load_permuted
= false;
3081 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3082 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3084 this_load_permuted
= true;
3087 stmt_vec_info first_stmt_info
3088 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3089 if (!this_load_permuted
3090 /* The load requires permutation when unrolling exposes
3091 a gap either because the group is larger than the SLP
3092 group-size or because there is a gap between the groups. */
3093 && (known_eq (LOOP_VINFO_VECT_FACTOR
3094 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3095 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3096 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3098 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3106 /* For each possible SLP instance decide whether to SLP it and calculate overall
3107 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3108 least one instance. */
3111 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3114 poly_uint64 unrolling_factor
= 1;
3115 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3116 slp_instance instance
;
3117 int decided_to_slp
= 0;
3119 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3121 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3123 /* FORNOW: SLP if you can. */
3124 /* All unroll factors have the form:
3126 GET_MODE_SIZE (vinfo->vector_mode) * X
3128 for some rational X, so they must have a common multiple. */
3130 = force_common_multiple (unrolling_factor
,
3131 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3133 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3134 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3135 loop-based vectorization. Such stmts will be marked as HYBRID. */
3136 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3140 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3142 if (decided_to_slp
&& dump_enabled_p ())
3144 dump_printf_loc (MSG_NOTE
, vect_location
,
3145 "Decided to SLP %d instances. Unrolling factor ",
3147 dump_dec (MSG_NOTE
, unrolling_factor
);
3148 dump_printf (MSG_NOTE
, "\n");
3151 return (decided_to_slp
> 0);
3154 /* Private data for vect_detect_hybrid_slp. */
3157 loop_vec_info loop_vinfo
;
3158 vec
<stmt_vec_info
> *worklist
;
3161 /* Walker for walk_gimple_op. */
3164 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3166 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3167 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3172 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3175 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3176 if (PURE_SLP_STMT (def_stmt_info
))
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3180 def_stmt_info
->stmt
);
3181 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3182 dat
->worklist
->safe_push (def_stmt_info
);
3188 /* Find stmts that must be both vectorized and SLPed. */
3191 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3193 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3195 /* All stmts participating in SLP are marked pure_slp, all other
3196 stmts are loop_vect.
3197 First collect all loop_vect stmts into a worklist. */
3198 auto_vec
<stmt_vec_info
> worklist
;
3199 for (unsigned i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
3201 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3202 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3205 gphi
*phi
= gsi
.phi ();
3206 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3207 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3208 worklist
.safe_push (stmt_info
);
3210 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
3213 gimple
*stmt
= gsi_stmt (gsi
);
3214 if (is_gimple_debug (stmt
))
3216 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3217 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3219 for (gimple_stmt_iterator gsi2
3220 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3221 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3223 stmt_vec_info patt_info
3224 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3225 if (!STMT_SLP_TYPE (patt_info
)
3226 && STMT_VINFO_RELEVANT (patt_info
))
3227 worklist
.safe_push (patt_info
);
3229 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3231 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3232 worklist
.safe_push (stmt_info
);
3236 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3237 mark any SLP vectorized stmt as hybrid.
3238 ??? We're visiting def stmts N times (once for each non-SLP and
3239 once for each hybrid-SLP use). */
3242 dat
.worklist
= &worklist
;
3243 dat
.loop_vinfo
= loop_vinfo
;
3244 memset (&wi
, 0, sizeof (wi
));
3245 wi
.info
= (void *)&dat
;
3246 while (!worklist
.is_empty ())
3248 stmt_vec_info stmt_info
= worklist
.pop ();
3249 /* Since SSA operands are not set up for pattern stmts we need
3250 to use walk_gimple_op. */
3252 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3257 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3259 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3260 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
)
3262 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3265 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3268 gphi
*phi
= si
.phi ();
3269 gimple_set_uid (phi
, 0);
3272 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3273 !gsi_end_p (gsi
); gsi_next (&gsi
))
3275 gimple
*stmt
= gsi_stmt (gsi
);
3276 gimple_set_uid (stmt
, 0);
3277 if (is_gimple_debug (stmt
))
3285 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3286 stmts in the basic block. */
3288 _bb_vec_info::~_bb_vec_info ()
3290 /* Reset region marker. */
3291 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3294 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3297 gphi
*phi
= si
.phi ();
3298 gimple_set_uid (phi
, -1);
3300 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3301 !gsi_end_p (gsi
); gsi_next (&gsi
))
3303 gimple
*stmt
= gsi_stmt (gsi
);
3304 gimple_set_uid (stmt
, -1);
3309 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3310 given then that child nodes have already been processed, and that
3311 their def types currently match their SLP node's def type. */
3314 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3315 slp_instance node_instance
,
3316 stmt_vector_for_cost
*cost_vec
)
3318 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3319 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3321 /* Calculate the number of vector statements to be created for the
3322 scalar stmts in this node. For SLP reductions it is equal to the
3323 number of vector statements in the children (which has already been
3324 calculated by the recursive call). Otherwise it is the number of
3325 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3326 VF divided by the number of elements in a vector. */
3327 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3328 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
3330 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3331 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3333 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3334 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3341 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3342 vf
= loop_vinfo
->vectorization_factor
;
3345 unsigned int group_size
= SLP_TREE_LANES (node
);
3346 tree vectype
= SLP_TREE_VECTYPE (node
);
3347 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3348 = vect_get_num_vectors (vf
* group_size
, vectype
);
3351 /* Handle purely internal nodes. */
3352 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3353 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3355 if (is_a
<bb_vec_info
> (vinfo
)
3356 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3360 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3361 node
, node_instance
, cost_vec
);
3364 /* Try to build NODE from scalars, returning true on success.
3365 NODE_INSTANCE is the SLP instance that contains NODE. */
3368 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3369 slp_instance node_instance
)
3371 stmt_vec_info stmt_info
;
3374 if (!is_a
<bb_vec_info
> (vinfo
)
3375 || node
== SLP_INSTANCE_TREE (node_instance
)
3376 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3377 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3380 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_NOTE
, vect_location
,
3382 "Building vector operands from scalars instead\n");
3384 /* Don't remove and free the child nodes here, since they could be
3385 referenced by other structures. The analysis and scheduling phases
3386 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3387 unsigned int group_size
= SLP_TREE_LANES (node
);
3388 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3389 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3390 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3391 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3393 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3394 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3399 /* Compute the prologue cost for invariant or constant operands represented
3403 vect_prologue_cost_for_slp (slp_tree node
,
3404 stmt_vector_for_cost
*cost_vec
)
3406 /* There's a special case of an existing vector, that costs nothing. */
3407 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3408 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3410 /* Without looking at the actual initializer a vector of
3411 constants can be implemented as load from the constant pool.
3412 When all elements are the same we can use a splat. */
3413 tree vectype
= SLP_TREE_VECTYPE (node
);
3414 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3415 unsigned num_vects_to_check
;
3416 unsigned HOST_WIDE_INT const_nunits
;
3417 unsigned nelt_limit
;
3418 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3419 && ! multiple_p (const_nunits
, group_size
))
3421 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3422 nelt_limit
= const_nunits
;
3426 /* If either the vector has variable length or the vectors
3427 are composed of repeated whole groups we only need to
3428 cost construction once. All vectors will be the same. */
3429 num_vects_to_check
= 1;
3430 nelt_limit
= group_size
;
3432 tree elt
= NULL_TREE
;
3434 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3436 unsigned si
= j
% group_size
;
3438 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3439 /* ??? We're just tracking whether all operands of a single
3440 vector initializer are the same, ideally we'd check if
3441 we emitted the same one already. */
3442 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
3445 if (nelt
== nelt_limit
)
3447 record_stmt_cost (cost_vec
, 1,
3448 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3449 ? (elt
? scalar_to_vec
: vec_construct
)
3451 NULL
, vectype
, 0, vect_prologue
);
3457 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3458 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3460 Return true if the operations are supported. */
3463 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3464 slp_instance node_instance
,
3465 hash_set
<slp_tree
> &visited
,
3466 hash_set
<slp_tree
> &lvisited
,
3467 stmt_vector_for_cost
*cost_vec
)
3472 /* Assume we can code-generate all invariants. */
3474 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3475 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
3478 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3480 if (dump_enabled_p ())
3481 dump_printf_loc (MSG_NOTE
, vect_location
,
3482 "Failed cyclic SLP reference in %p", node
);
3485 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3487 /* If we already analyzed the exact same set of scalar stmts we're done.
3488 We share the generated vector stmts for those. */
3489 if (visited
.contains (node
)
3490 || lvisited
.add (node
))
3494 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3496 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3497 visited
, lvisited
, cost_vec
);
3503 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3506 lvisited
.remove (node
);
3508 /* When the node can be vectorized cost invariant nodes it references.
3509 This is not done in DFS order to allow the refering node
3510 vectorizable_* calls to nail down the invariant nodes vector type
3511 and possibly unshare it if it needs a different vector type than
3514 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3516 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3517 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3518 /* Perform usual caching, note code-generation still
3519 code-gens these nodes multiple times but we expect
3520 to CSE them later. */
3521 && !visited
.contains (child
)
3522 && !lvisited
.add (child
))
3524 /* ??? After auditing more code paths make a "default"
3525 and push the vector type from NODE to all children
3526 if it is not already set. */
3527 /* Compute the number of vectors to be generated. */
3528 tree vector_type
= SLP_TREE_VECTYPE (child
);
3531 /* For shifts with a scalar argument we don't need
3532 to cost or code-generate anything.
3533 ??? Represent this more explicitely. */
3534 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3535 == shift_vec_info_type
)
3539 unsigned group_size
= SLP_TREE_LANES (child
);
3541 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3542 vf
= loop_vinfo
->vectorization_factor
;
3543 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3544 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3545 /* And cost them. */
3546 vect_prologue_cost_for_slp (child
, cost_vec
);
3549 /* If this node or any of its children can't be vectorized, try pruning
3550 the tree here rather than felling the whole thing. */
3551 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3553 /* We'll need to revisit this for invariant costing and number
3554 of vectorized stmt setting. */
3562 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3563 region and that can be vectorized using vectorizable_live_operation
3564 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3565 scalar code computing it to be retained. */
3568 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3569 slp_instance instance
,
3570 stmt_vector_for_cost
*cost_vec
,
3571 hash_set
<stmt_vec_info
> &svisited
,
3572 hash_set
<slp_tree
> &visited
)
3574 if (visited
.add (node
))
3578 stmt_vec_info stmt_info
;
3579 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3580 bool all_visited
= true;
3581 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3583 if (svisited
.contains (stmt_info
))
3585 all_visited
= false;
3586 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3587 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
3588 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
3589 /* Only the pattern root stmt computes the original scalar value. */
3591 bool mark_visited
= true;
3592 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3593 ssa_op_iter op_iter
;
3594 def_operand_p def_p
;
3595 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3597 imm_use_iterator use_iter
;
3599 stmt_vec_info use_stmt_info
;
3600 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3601 if (!is_gimple_debug (use_stmt
))
3603 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
3605 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3607 STMT_VINFO_LIVE_P (stmt_info
) = true;
3608 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3609 NULL
, node
, instance
, i
,
3611 /* ??? So we know we can vectorize the live stmt
3612 from one SLP node. If we cannot do so from all
3613 or none consistently we'd have to record which
3614 SLP node (and lane) we want to use for the live
3615 operation. So make sure we can code-generate
3617 mark_visited
= false;
3619 STMT_VINFO_LIVE_P (stmt_info
) = false;
3620 BREAK_FROM_IMM_USE_STMT (use_iter
);
3623 /* We have to verify whether we can insert the lane extract
3624 before all uses. The following is a conservative approximation.
3625 We cannot put this into vectorizable_live_operation because
3626 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3628 Note that while the fact that we emit code for loads at the
3629 first load should make this a non-problem leafs we construct
3630 from scalars are vectorized after the last scalar def.
3631 ??? If we'd actually compute the insert location during
3632 analysis we could use sth less conservative than the last
3633 scalar stmt in the node for the dominance check. */
3634 /* ??? What remains is "live" uses in vector CTORs in the same
3635 SLP graph which is where those uses can end up code-generated
3636 right after their definition instead of close to their original
3637 use. But that would restrict us to code-generate lane-extracts
3638 from the latest stmt in a node. So we compensate for this
3639 during code-generation, simply not replacing uses for those
3640 hopefully rare cases. */
3641 if (STMT_VINFO_LIVE_P (stmt_info
))
3642 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3643 if (!is_gimple_debug (use_stmt
)
3644 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
3645 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3646 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3650 "Cannot determine insertion place for "
3652 STMT_VINFO_LIVE_P (stmt_info
) = false;
3653 mark_visited
= true;
3657 svisited
.add (stmt_info
);
3663 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3664 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3665 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
3666 cost_vec
, svisited
, visited
);
3669 /* Analyze statements in SLP instances of VINFO. Return true if the
3670 operations are supported. */
3673 vect_slp_analyze_operations (vec_info
*vinfo
)
3675 slp_instance instance
;
3678 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3680 hash_set
<slp_tree
> visited
;
3681 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
3683 hash_set
<slp_tree
> lvisited
;
3684 stmt_vector_for_cost cost_vec
;
3685 cost_vec
.create (2);
3686 if (is_a
<bb_vec_info
> (vinfo
))
3687 vect_location
= instance
->location ();
3688 if (!vect_slp_analyze_node_operations (vinfo
,
3689 SLP_INSTANCE_TREE (instance
),
3690 instance
, visited
, lvisited
,
3692 /* Instances with a root stmt require vectorized defs for the
3694 || (SLP_INSTANCE_ROOT_STMT (instance
)
3695 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
3696 != vect_internal_def
)))
3698 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3699 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3700 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_NOTE
, vect_location
,
3702 "removing SLP instance operations starting from: %G",
3704 vect_free_slp_instance (instance
);
3705 vinfo
->slp_instances
.ordered_remove (i
);
3706 cost_vec
.release ();
3710 for (hash_set
<slp_tree
>::iterator x
= lvisited
.begin();
3711 x
!= lvisited
.end(); ++x
)
3715 /* For BB vectorization remember the SLP graph entry
3717 if (is_a
<bb_vec_info
> (vinfo
))
3718 instance
->cost_vec
= cost_vec
;
3721 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
3722 cost_vec
.release ();
3727 /* Compute vectorizable live stmts. */
3728 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3730 hash_set
<stmt_vec_info
> svisited
;
3731 hash_set
<slp_tree
> visited
;
3732 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3734 vect_location
= instance
->location ();
3735 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
3736 instance
, &instance
->cost_vec
, svisited
,
3741 return !vinfo
->slp_instances
.is_empty ();
3744 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3745 closing the eventual chain. */
3748 get_ultimate_leader (slp_instance instance
,
3749 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3751 auto_vec
<slp_instance
*, 8> chain
;
3753 while (*(tem
= instance_leader
.get (instance
)) != instance
)
3755 chain
.safe_push (tem
);
3758 while (!chain
.is_empty ())
3759 *chain
.pop () = instance
;
3763 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3766 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
3767 slp_instance instance
, slp_tree node
,
3768 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
3769 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
3770 hash_set
<slp_tree
> &visited
)
3772 stmt_vec_info stmt_info
;
3775 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3778 slp_instance
&stmt_instance
3779 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
3782 else if (stmt_instance
!= instance
)
3784 /* If we're running into a previously marked stmt make us the
3785 leader of the current ultimate leader. This keeps the
3786 leader chain acyclic and works even when the current instance
3787 connects two previously independent graph parts. */
3788 slp_instance stmt_leader
3789 = get_ultimate_leader (stmt_instance
, instance_leader
);
3790 if (stmt_leader
!= instance
)
3791 instance_leader
.put (stmt_leader
, instance
);
3793 stmt_instance
= instance
;
3796 if (visited
.add (node
))
3800 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3801 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3802 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
3803 instance_leader
, visited
);
3806 /* Partition the SLP graph into pieces that can be costed independently. */
3809 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
3811 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3813 /* First walk the SLP graph assigning each involved scalar stmt a
3814 corresponding SLP graph entry and upon visiting a previously
3815 marked stmt, make the stmts leader the current SLP graph entry. */
3816 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
3817 hash_map
<slp_instance
, slp_instance
> instance_leader
;
3818 hash_set
<slp_tree
> visited
;
3819 slp_instance instance
;
3820 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3822 instance_leader
.put (instance
, instance
);
3823 vect_bb_partition_graph_r (bb_vinfo
,
3824 instance
, SLP_INSTANCE_TREE (instance
),
3825 stmt_to_instance
, instance_leader
,
3829 /* Then collect entries to each independent subgraph. */
3830 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3832 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
3833 leader
->subgraph_entries
.safe_push (instance
);
3834 if (dump_enabled_p ()
3835 && leader
!= instance
)
3836 dump_printf_loc (MSG_NOTE
, vect_location
,
3837 "instance %p is leader of %p\n",
3842 /* Compute the scalar cost of the SLP node NODE and its children
3843 and return it. Do not account defs that are marked in LIFE and
3844 update LIFE according to uses of NODE. */
3847 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
3848 slp_tree node
, vec
<bool, va_heap
> *life
,
3849 stmt_vector_for_cost
*cost_vec
,
3850 hash_set
<slp_tree
> &visited
)
3853 stmt_vec_info stmt_info
;
3856 if (visited
.add (node
))
3859 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3861 ssa_op_iter op_iter
;
3862 def_operand_p def_p
;
3867 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3868 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3870 /* If there is a non-vectorized use of the defs then the scalar
3871 stmt is kept live in which case we do not account it or any
3872 required defs in the SLP children in the scalar cost. This
3873 way we make the vectorization more costly when compared to
3875 if (!STMT_VINFO_LIVE_P (stmt_info
))
3877 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3879 imm_use_iterator use_iter
;
3881 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3882 if (!is_gimple_debug (use_stmt
))
3884 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
3887 (vect_stmt_to_vectorize (use_stmt_info
)))
3890 BREAK_FROM_IMM_USE_STMT (use_iter
);
3898 /* Count scalar stmts only once. */
3899 if (gimple_visited_p (orig_stmt
))
3901 gimple_set_visited (orig_stmt
, true);
3903 vect_cost_for_stmt kind
;
3904 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
3906 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
3909 kind
= scalar_store
;
3911 else if (vect_nop_conversion_p (orig_stmt_info
))
3915 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
, 0, vect_body
);
3918 auto_vec
<bool, 20> subtree_life
;
3919 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3921 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3923 /* Do not directly pass LIFE to the recursive call, copy it to
3924 confine changes in the callee to the current child/subtree. */
3925 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3927 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
3928 for (unsigned j
= 0;
3929 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
3931 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
3932 if (perm
.first
== i
)
3933 subtree_life
[perm
.second
] = (*life
)[j
];
3938 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
3939 subtree_life
.safe_splice (*life
);
3941 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
3943 subtree_life
.truncate (0);
3948 /* Check if vectorization of the basic block is profitable for the
3949 subgraph denoted by SLP_INSTANCES. */
3952 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
3953 vec
<slp_instance
> slp_instances
)
3955 slp_instance instance
;
3957 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
3958 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
3960 void *vect_target_cost_data
= init_cost (NULL
);
3962 /* Calculate scalar cost and sum the cost for the vector stmts
3963 previously collected. */
3964 stmt_vector_for_cost scalar_costs
;
3965 scalar_costs
.create (0);
3966 hash_set
<slp_tree
> visited
;
3967 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3969 auto_vec
<bool, 20> life
;
3970 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
3972 vect_bb_slp_scalar_cost (bb_vinfo
,
3973 SLP_INSTANCE_TREE (instance
),
3974 &life
, &scalar_costs
, visited
);
3975 add_stmt_costs (bb_vinfo
, vect_target_cost_data
, &instance
->cost_vec
);
3976 instance
->cost_vec
.release ();
3978 /* Unset visited flag. */
3979 stmt_info_for_cost
*si
;
3980 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
3981 gimple_set_visited (si
->stmt_info
->stmt
, false);
3983 void *scalar_target_cost_data
= init_cost (NULL
);
3984 add_stmt_costs (bb_vinfo
, scalar_target_cost_data
, &scalar_costs
);
3985 scalar_costs
.release ();
3987 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
3988 destroy_cost_data (scalar_target_cost_data
);
3990 /* Complete the target-specific vector cost calculation. */
3991 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
3992 &vec_inside_cost
, &vec_epilogue_cost
);
3993 destroy_cost_data (vect_target_cost_data
);
3995 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
3997 if (dump_enabled_p ())
3999 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4000 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
4002 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
4003 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
4004 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
4007 /* Vectorization is profitable if its cost is more than the cost of scalar
4008 version. Note that we err on the vector side for equal cost because
4009 the cost estimate is otherwise quite pessimistic (constant uses are
4010 free on the scalar side but cost a load on the vector side for
4012 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4018 /* Find any vectorizable constructors and add them to the grouped_store
4022 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4024 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4025 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4026 !gsi_end_p (gsi
); gsi_next (&gsi
))
4028 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4029 if (!assign
|| gimple_assign_rhs_code (assign
) != CONSTRUCTOR
)
4032 tree rhs
= gimple_assign_rhs1 (assign
);
4033 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4034 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4035 CONSTRUCTOR_NELTS (rhs
))
4036 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4037 || uniform_vector_p (rhs
))
4040 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4041 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4045 /* Check if the region described by BB_VINFO can be vectorized, returning
4046 true if so. When returning false, set FATAL to true if the same failure
4047 would prevent vectorization at other vector sizes, false if it is still
4048 worth trying other sizes. N_STMTS is the number of statements in the
4052 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4053 vec
<int> *dataref_groups
)
4055 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4057 slp_instance instance
;
4059 poly_uint64 min_vf
= 2;
4061 /* The first group of checks is independent of the vector size. */
4064 /* Analyze the data references. */
4066 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4068 if (dump_enabled_p ())
4069 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4070 "not vectorized: unhandled data-ref in basic "
4075 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4077 if (dump_enabled_p ())
4078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4079 "not vectorized: unhandled data access in "
4084 vect_slp_check_for_constructors (bb_vinfo
);
4086 /* If there are no grouped stores and no constructors in the region
4087 there is no need to continue with pattern recog as vect_analyze_slp
4088 will fail anyway. */
4089 if (bb_vinfo
->grouped_stores
.is_empty ())
4091 if (dump_enabled_p ())
4092 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4093 "not vectorized: no grouped stores in "
4098 /* While the rest of the analysis below depends on it in some way. */
4101 vect_pattern_recog (bb_vinfo
);
4103 /* Check the SLP opportunities in the basic block, analyze and build SLP
4105 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4107 if (dump_enabled_p ())
4109 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4110 "Failed to SLP the basic block.\n");
4111 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4112 "not vectorized: failed to find SLP opportunities "
4113 "in basic block.\n");
4118 /* Optimize permutations. */
4119 vect_optimize_slp (bb_vinfo
);
4121 vect_record_base_alignments (bb_vinfo
);
4123 /* Analyze and verify the alignment of data references and the
4124 dependence in the SLP instances. */
4125 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4127 vect_location
= instance
->location ();
4128 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4129 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4131 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4132 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4133 if (dump_enabled_p ())
4134 dump_printf_loc (MSG_NOTE
, vect_location
,
4135 "removing SLP instance operations starting from: %G",
4137 vect_free_slp_instance (instance
);
4138 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4142 /* Mark all the statements that we want to vectorize as pure SLP and
4144 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4145 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4146 if (SLP_INSTANCE_ROOT_STMT (instance
))
4147 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance
)) = pure_slp
;
4151 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4154 if (!vect_slp_analyze_operations (bb_vinfo
))
4156 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4158 "not vectorized: bad operation in basic block.\n");
4162 vect_bb_partition_graph (bb_vinfo
);
4167 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4168 basic blocks in BBS, returning true on success.
4169 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4172 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4173 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4175 bb_vec_info bb_vinfo
;
4176 auto_vector_modes vector_modes
;
4178 /* Autodetect first vector size we try. */
4179 machine_mode next_vector_mode
= VOIDmode
;
4180 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4181 unsigned int mode_i
= 0;
4183 vec_info_shared shared
;
4185 machine_mode autodetected_vector_mode
= VOIDmode
;
4188 bool vectorized
= false;
4190 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4192 bool first_time_p
= shared
.datarefs
.is_empty ();
4193 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4195 bb_vinfo
->shared
->save_datarefs ();
4197 bb_vinfo
->shared
->check_datarefs ();
4198 bb_vinfo
->vector_mode
= next_vector_mode
;
4200 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
)
4201 && dbg_cnt (vect_slp
))
4203 if (dump_enabled_p ())
4205 dump_printf_loc (MSG_NOTE
, vect_location
,
4206 "***** Analysis succeeded with vector mode"
4207 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4208 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4211 bb_vinfo
->shared
->check_datarefs ();
4214 slp_instance instance
;
4215 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4217 if (instance
->subgraph_entries
.is_empty ())
4220 vect_location
= instance
->location ();
4221 if (!unlimited_cost_model (NULL
)
4222 && !vect_bb_vectorization_profitable_p
4223 (bb_vinfo
, instance
->subgraph_entries
))
4225 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4227 "not vectorized: vectorization is not "
4232 if (!vectorized
&& dump_enabled_p ())
4233 dump_printf_loc (MSG_NOTE
, vect_location
,
4234 "Basic block will be vectorized "
4238 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4240 unsigned HOST_WIDE_INT bytes
;
4241 if (dump_enabled_p ())
4244 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4245 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4246 "basic block part vectorized using %wu "
4247 "byte vectors\n", bytes
);
4249 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4250 "basic block part vectorized using "
4251 "variable length vectors\n");
4257 if (dump_enabled_p ())
4258 dump_printf_loc (MSG_NOTE
, vect_location
,
4259 "***** Analysis failed with vector mode %s\n",
4260 GET_MODE_NAME (bb_vinfo
->vector_mode
));
4264 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
4267 while (mode_i
< vector_modes
.length ()
4268 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
4270 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_NOTE
, vect_location
,
4272 "***** The result for vector mode %s would"
4274 GET_MODE_NAME (vector_modes
[mode_i
]));
4280 if (mode_i
< vector_modes
.length ()
4281 && VECTOR_MODE_P (autodetected_vector_mode
)
4282 && (related_vector_mode (vector_modes
[mode_i
],
4283 GET_MODE_INNER (autodetected_vector_mode
))
4284 == autodetected_vector_mode
)
4285 && (related_vector_mode (autodetected_vector_mode
,
4286 GET_MODE_INNER (vector_modes
[mode_i
]))
4287 == vector_modes
[mode_i
]))
4289 if (dump_enabled_p ())
4290 dump_printf_loc (MSG_NOTE
, vect_location
,
4291 "***** Skipping vector mode %s, which would"
4292 " repeat the analysis for %s\n",
4293 GET_MODE_NAME (vector_modes
[mode_i
]),
4294 GET_MODE_NAME (autodetected_vector_mode
));
4299 || mode_i
== vector_modes
.length ()
4300 || autodetected_vector_mode
== VOIDmode
4301 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4302 vector sizes will fail do not bother iterating. */
4306 /* Try the next biggest vector size. */
4307 next_vector_mode
= vector_modes
[mode_i
++];
4308 if (dump_enabled_p ())
4309 dump_printf_loc (MSG_NOTE
, vect_location
,
4310 "***** Re-trying analysis with vector mode %s\n",
4311 GET_MODE_NAME (next_vector_mode
));
4316 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4317 true if anything in the basic-block was vectorized. */
4320 vect_slp_bbs (vec
<basic_block
> bbs
)
4322 vec
<data_reference_p
> datarefs
= vNULL
;
4323 auto_vec
<int> dataref_groups
;
4325 int current_group
= 0;
4327 for (unsigned i
= 0; i
< bbs
.length (); i
++)
4329 basic_block bb
= bbs
[i
];
4330 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
4333 gimple
*stmt
= gsi_stmt (gsi
);
4334 if (is_gimple_debug (stmt
))
4339 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
4340 vect_location
= stmt
;
4342 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
4343 &dataref_groups
, current_group
))
4348 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
4351 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4352 true if anything in the basic-block was vectorized. */
4355 vect_slp_bb (basic_block bb
)
4357 auto_vec
<basic_block
> bbs
;
4359 return vect_slp_bbs (bbs
);
4362 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4363 true if anything in the basic-block was vectorized. */
4366 vect_slp_function (function
*fun
)
4369 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
4370 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
4372 /* For the moment split the function into pieces to avoid making
4373 the iteration on the vector mode moot. Split at points we know
4374 to not handle well which is CFG merges (SLP discovery doesn't
4375 handle non-loop-header PHIs) and loop exits. Since pattern
4376 recog requires reverse iteration to visit uses before defs
4377 simply chop RPO into pieces. */
4378 auto_vec
<basic_block
> bbs
;
4379 for (unsigned i
= 0; i
< n
; i
++)
4381 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
4384 /* Split when a BB is not dominated by the first block. */
4385 if (!bbs
.is_empty ()
4386 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
4388 if (dump_enabled_p ())
4389 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4390 "splitting region at dominance boundary bb%d\n",
4394 /* Split when the loop determined by the first block
4395 is exited. This is because we eventually insert
4396 invariants at region begin. */
4397 else if (!bbs
.is_empty ()
4398 && bbs
[0]->loop_father
!= bb
->loop_father
4399 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
4401 if (dump_enabled_p ())
4402 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4403 "splitting region at loop %d exit at bb%d\n",
4404 bbs
[0]->loop_father
->num
, bb
->index
);
4408 if (split
&& !bbs
.is_empty ())
4410 r
|= vect_slp_bbs (bbs
);
4412 bbs
.quick_push (bb
);
4417 /* When we have a stmt ending this block and defining a
4418 value we have to insert on edges when inserting after it for
4419 a vector containing its definition. Avoid this for now. */
4420 if (gimple
*last
= last_stmt (bb
))
4421 if (gimple_get_lhs (last
)
4422 && is_ctrl_altering_stmt (last
))
4424 if (dump_enabled_p ())
4425 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4426 "splitting region at control altering "
4427 "definition %G", last
);
4428 r
|= vect_slp_bbs (bbs
);
4433 if (!bbs
.is_empty ())
4434 r
|= vect_slp_bbs (bbs
);
4441 /* Build a variable-length vector in which the elements in ELTS are repeated
4442 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4443 RESULTS and add any new instructions to SEQ.
4445 The approach we use is:
4447 (1) Find a vector mode VM with integer elements of mode IM.
4449 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4450 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4451 from small vectors to IM.
4453 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4455 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4456 correct byte contents.
4458 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4460 We try to find the largest IM for which this sequence works, in order
4461 to cut down on the number of interleaves. */
4464 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
4465 vec
<tree
> elts
, unsigned int nresults
,
4468 unsigned int nelts
= elts
.length ();
4469 tree element_type
= TREE_TYPE (vector_type
);
4471 /* (1) Find a vector mode VM with integer elements of mode IM. */
4472 unsigned int nvectors
= 1;
4473 tree new_vector_type
;
4475 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
4476 &nvectors
, &new_vector_type
,
4480 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4481 unsigned int partial_nelts
= nelts
/ nvectors
;
4482 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
4484 tree_vector_builder partial_elts
;
4485 auto_vec
<tree
, 32> pieces (nvectors
* 2);
4486 pieces
.quick_grow (nvectors
* 2);
4487 for (unsigned int i
= 0; i
< nvectors
; ++i
)
4489 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4490 ELTS' has mode IM. */
4491 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
4492 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
4493 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
4494 tree t
= gimple_build_vector (seq
, &partial_elts
);
4495 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
4496 TREE_TYPE (new_vector_type
), t
);
4498 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4499 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
4502 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4503 correct byte contents.
4505 We need to repeat the following operation log2(nvectors) times:
4507 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4508 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4510 However, if each input repeats every N elements and the VF is
4511 a multiple of N * 2, the HI result is the same as the LO. */
4512 unsigned int in_start
= 0;
4513 unsigned int out_start
= nvectors
;
4514 unsigned int hi_start
= nvectors
/ 2;
4515 /* A bound on the number of outputs needed to produce NRESULTS results
4516 in the final iteration. */
4517 unsigned int noutputs_bound
= nvectors
* nresults
;
4518 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
4520 noutputs_bound
/= 2;
4521 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
4522 for (unsigned int i
= 0; i
< limit
; ++i
)
4525 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
4528 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
4532 tree output
= make_ssa_name (new_vector_type
);
4533 tree input1
= pieces
[in_start
+ (i
/ 2)];
4534 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
4535 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
4538 gimple_seq_add_stmt (seq
, stmt
);
4539 pieces
[out_start
+ i
] = output
;
4541 std::swap (in_start
, out_start
);
4544 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4545 results
.reserve (nresults
);
4546 for (unsigned int i
= 0; i
< nresults
; ++i
)
4548 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
4549 pieces
[in_start
+ i
]));
4551 results
.quick_push (results
[i
- nvectors
]);
4555 /* For constant and loop invariant defs in OP_NODE this function creates
4556 vector defs that will be used in the vectorized stmts and stores them
4557 to SLP_TREE_VEC_DEFS of OP_NODE. */
4560 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
4562 unsigned HOST_WIDE_INT nunits
;
4564 unsigned j
, number_of_places_left_in_vector
;
4567 int group_size
= op_node
->ops
.length ();
4568 unsigned int vec_num
, i
;
4569 unsigned number_of_copies
= 1;
4571 gimple_seq ctor_seq
= NULL
;
4572 auto_vec
<tree
, 16> permute_results
;
4574 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4575 vector_type
= SLP_TREE_VECTYPE (op_node
);
4577 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
4578 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
4579 auto_vec
<tree
> voprnds (number_of_vectors
);
4581 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4582 created vectors. It is greater than 1 if unrolling is performed.
4584 For example, we have two scalar operands, s1 and s2 (e.g., group of
4585 strided accesses of size two), while NUNITS is four (i.e., four scalars
4586 of this type can be packed in a vector). The output vector will contain
4587 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4590 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4591 containing the operands.
4593 For example, NUNITS is four as before, and the group size is 8
4594 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4595 {s5, s6, s7, s8}. */
4597 /* When using duplicate_and_interleave, we just need one element for
4598 each scalar statement. */
4599 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4600 nunits
= group_size
;
4602 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
4604 number_of_places_left_in_vector
= nunits
;
4606 tree_vector_builder
elts (vector_type
, nunits
, 1);
4607 elts
.quick_grow (nunits
);
4608 stmt_vec_info insert_after
= NULL
;
4609 for (j
= 0; j
< number_of_copies
; j
++)
4612 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
4614 /* Create 'vect_ = {op0,op1,...,opn}'. */
4615 number_of_places_left_in_vector
--;
4617 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
4619 if (CONSTANT_CLASS_P (op
))
4621 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4623 /* Can't use VIEW_CONVERT_EXPR for booleans because
4624 of possibly different sizes of scalar value and
4626 if (integer_zerop (op
))
4627 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
4628 else if (integer_onep (op
))
4629 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
4634 op
= fold_unary (VIEW_CONVERT_EXPR
,
4635 TREE_TYPE (vector_type
), op
);
4636 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
4640 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
4642 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4645 = build_all_ones_cst (TREE_TYPE (vector_type
));
4647 = build_zero_cst (TREE_TYPE (vector_type
));
4648 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
4649 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
4655 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
4658 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
4661 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
4665 elts
[number_of_places_left_in_vector
] = op
;
4666 if (!CONSTANT_CLASS_P (op
))
4668 /* For BB vectorization we have to compute an insert location
4669 when a def is inside the analyzed region since we cannot
4670 simply insert at the BB start in this case. */
4671 stmt_vec_info opdef
;
4672 if (TREE_CODE (orig_op
) == SSA_NAME
4673 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
4674 && is_a
<bb_vec_info
> (vinfo
)
4675 && (opdef
= vinfo
->lookup_def (orig_op
)))
4678 insert_after
= opdef
;
4680 insert_after
= get_later_stmt (insert_after
, opdef
);
4683 if (number_of_places_left_in_vector
== 0)
4686 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4687 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4688 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
4691 if (permute_results
.is_empty ())
4692 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
4693 elts
, number_of_vectors
,
4695 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
4697 if (!gimple_seq_empty_p (ctor_seq
))
4701 gimple_stmt_iterator gsi
;
4702 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
4704 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
4705 gsi_insert_seq_before (&gsi
, ctor_seq
,
4706 GSI_CONTINUE_LINKING
);
4708 else if (!stmt_ends_bb_p (insert_after
->stmt
))
4710 gsi
= gsi_for_stmt (insert_after
->stmt
);
4711 gsi_insert_seq_after (&gsi
, ctor_seq
,
4712 GSI_CONTINUE_LINKING
);
4716 /* When we want to insert after a def where the
4717 defining stmt throws then insert on the fallthru
4719 edge e
= find_fallthru_edge
4720 (gimple_bb (insert_after
->stmt
)->succs
);
4722 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
4723 gcc_assert (!new_bb
);
4727 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
4730 voprnds
.quick_push (vec_cst
);
4731 insert_after
= NULL
;
4732 number_of_places_left_in_vector
= nunits
;
4734 elts
.new_vector (vector_type
, nunits
, 1);
4735 elts
.quick_grow (nunits
);
4740 /* Since the vectors are created in the reverse order, we should invert
4742 vec_num
= voprnds
.length ();
4743 for (j
= vec_num
; j
!= 0; j
--)
4745 vop
= voprnds
[j
- 1];
4746 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4749 /* In case that VF is greater than the unrolling factor needed for the SLP
4750 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4751 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4752 to replicate the vectors. */
4753 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
4754 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
4756 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4759 /* Get the Ith vectorized definition from SLP_NODE. */
4762 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
4764 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
4765 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
4767 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
4770 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4773 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
4775 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
4776 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
4779 gimple
*vec_def_stmt
;
4780 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
4781 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
4784 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
4787 /* Get N vectorized definitions for SLP_NODE. */
4790 vect_get_slp_defs (vec_info
*,
4791 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
4794 n
= SLP_TREE_CHILDREN (slp_node
).length ();
4796 for (unsigned i
= 0; i
< n
; ++i
)
4798 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
4799 vec
<tree
> vec_defs
= vNULL
;
4800 vect_get_slp_defs (child
, &vec_defs
);
4801 vec_oprnds
->quick_push (vec_defs
);
4805 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4806 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4807 permute statements for the SLP node NODE. */
4810 vect_transform_slp_perm_load (vec_info
*vinfo
,
4811 slp_tree node
, vec
<tree
> dr_chain
,
4812 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
4813 bool analyze_only
, unsigned *n_perms
)
4815 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4817 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4818 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
4819 unsigned int mask_element
;
4822 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
4825 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4827 mode
= TYPE_MODE (vectype
);
4828 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4830 /* Initialize the vect stmts of NODE to properly insert the generated
4833 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
4834 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
4835 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
4837 /* Generate permutation masks for every NODE. Number of masks for each NODE
4838 is equal to GROUP_SIZE.
4839 E.g., we have a group of three nodes with three loads from the same
4840 location in each node, and the vector size is 4. I.e., we have a
4841 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4842 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4843 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4846 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4847 The last mask is illegal since we assume two operands for permute
4848 operation, and the mask element values can't be outside that range.
4849 Hence, the last mask must be converted into {2,5,5,5}.
4850 For the first two permutations we need the first and the second input
4851 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4852 we need the second and the third vectors: {b1,c1,a2,b2} and
4855 int vect_stmts_counter
= 0;
4856 unsigned int index
= 0;
4857 int first_vec_index
= -1;
4858 int second_vec_index
= -1;
4862 vec_perm_builder mask
;
4863 unsigned int nelts_to_build
;
4864 unsigned int nvectors_per_build
;
4865 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
4866 && multiple_p (nunits
, group_size
));
4869 /* A single vector contains a whole number of copies of the node, so:
4870 (a) all permutes can use the same mask; and
4871 (b) the permutes only need a single vector input. */
4872 mask
.new_vector (nunits
, group_size
, 3);
4873 nelts_to_build
= mask
.encoded_nelts ();
4874 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
4878 /* We need to construct a separate mask for each vector statement. */
4879 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
4880 if (!nunits
.is_constant (&const_nunits
)
4881 || !vf
.is_constant (&const_vf
))
4883 mask
.new_vector (const_nunits
, const_nunits
, 1);
4884 nelts_to_build
= const_vf
* group_size
;
4885 nvectors_per_build
= 1;
4888 unsigned int count
= mask
.encoded_nelts ();
4889 mask
.quick_grow (count
);
4890 vec_perm_indices indices
;
4892 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
4894 unsigned int iter_num
= j
/ group_size
;
4895 unsigned int stmt_num
= j
% group_size
;
4896 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
4897 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
4900 first_vec_index
= 0;
4905 /* Enforced before the loop when !repeating_p. */
4906 unsigned int const_nunits
= nunits
.to_constant ();
4907 vec_index
= i
/ const_nunits
;
4908 mask_element
= i
% const_nunits
;
4909 if (vec_index
== first_vec_index
4910 || first_vec_index
== -1)
4912 first_vec_index
= vec_index
;
4914 else if (vec_index
== second_vec_index
4915 || second_vec_index
== -1)
4917 second_vec_index
= vec_index
;
4918 mask_element
+= const_nunits
;
4922 if (dump_enabled_p ())
4923 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4924 "permutation requires at "
4925 "least three vectors %G",
4927 gcc_assert (analyze_only
);
4931 gcc_assert (mask_element
< 2 * const_nunits
);
4934 if (mask_element
!= index
)
4936 mask
[index
++] = mask_element
;
4938 if (index
== count
&& !noop_p
)
4940 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
4941 if (!can_vec_perm_const_p (mode
, indices
))
4943 if (dump_enabled_p ())
4945 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
4947 "unsupported vect permute { ");
4948 for (i
= 0; i
< count
; ++i
)
4950 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
4951 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
4953 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
4955 gcc_assert (analyze_only
);
4966 tree mask_vec
= NULL_TREE
;
4969 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
4971 if (second_vec_index
== -1)
4972 second_vec_index
= first_vec_index
;
4974 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
4976 /* Generate the permute statement if necessary. */
4977 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
4978 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
4982 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
4984 = vect_create_destination_var (gimple_assign_lhs (stmt
),
4986 perm_dest
= make_ssa_name (perm_dest
);
4988 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
4989 first_vec
, second_vec
,
4991 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
4995 /* If mask was NULL_TREE generate the requested
4996 identity transform. */
4997 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
4999 /* Store the vector statement in NODE. */
5000 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5005 first_vec_index
= -1;
5006 second_vec_index
= -1;
5015 /* Vectorize the SLP permutations in NODE as specified
5016 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5017 child number and lane number.
5018 Interleaving of two two-lane two-child SLP subtrees (not supported):
5019 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5020 A blend of two four-lane two-child SLP subtrees:
5021 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5022 Highpart of a four-lane one-child SLP subtree (not supported):
5023 [ { 0, 2 }, { 0, 3 } ]
5024 Where currently only a subset is supported by code generating below. */
5027 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5028 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5030 tree vectype
= SLP_TREE_VECTYPE (node
);
5032 /* ??? We currently only support all same vector input and output types
5033 while the SLP IL should really do a concat + select and thus accept
5034 arbitrary mismatches. */
5037 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5038 if (!types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5040 if (dump_enabled_p ())
5041 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5042 "Unsupported lane permutation\n");
5046 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5047 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5048 if (dump_enabled_p ())
5050 dump_printf_loc (MSG_NOTE
, vect_location
,
5051 "vectorizing permutation");
5052 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5053 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5054 dump_printf (MSG_NOTE
, "\n");
5057 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5058 if (!nunits
.is_constant ())
5060 unsigned HOST_WIDE_INT vf
= 1;
5061 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5062 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
5064 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5065 gcc_assert (multiple_p (olanes
, nunits
));
5067 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5068 from the { SLP operand, scalar lane } permutation as recorded in the
5069 SLP node as intermediate step. This part should already work
5070 with SLP children with arbitrary number of lanes. */
5071 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5072 auto_vec
<unsigned> active_lane
;
5073 vperm
.create (olanes
);
5074 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5075 for (unsigned i
= 0; i
< vf
; ++i
)
5077 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5079 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5080 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5081 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
5082 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5083 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5084 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5086 /* Advance to the next group. */
5087 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5088 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5091 if (dump_enabled_p ())
5093 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5094 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5096 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5097 dump_printf (MSG_NOTE
, ",");
5098 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5099 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
5100 vperm
[i
].first
.second
);
5102 dump_printf (MSG_NOTE
, "\n");
5105 /* We can only handle two-vector permutes, everything else should
5106 be lowered on the SLP level. The following is closely inspired
5107 by vect_transform_slp_perm_load and is supposed to eventually
5109 ??? As intermediate step do code-gen in the SLP tree representation
5111 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5112 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5113 unsigned int const_nunits
= nunits
.to_constant ();
5114 unsigned int index
= 0;
5115 unsigned int mask_element
;
5116 vec_perm_builder mask
;
5117 mask
.new_vector (const_nunits
, const_nunits
, 1);
5118 unsigned int count
= mask
.encoded_nelts ();
5119 mask
.quick_grow (count
);
5120 vec_perm_indices indices
;
5121 unsigned nperms
= 0;
5122 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5124 mask_element
= vperm
[i
].second
;
5125 if (first_vec
.first
== -1U
5126 || first_vec
== vperm
[i
].first
)
5127 first_vec
= vperm
[i
].first
;
5128 else if (second_vec
.first
== -1U
5129 || second_vec
== vperm
[i
].first
)
5131 second_vec
= vperm
[i
].first
;
5132 mask_element
+= const_nunits
;
5136 if (dump_enabled_p ())
5137 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5138 "permutation requires at "
5139 "least three vectors");
5144 mask
[index
++] = mask_element
;
5148 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5150 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5152 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5154 if (dump_enabled_p ())
5156 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5158 "unsupported vect permute { ");
5159 for (i
= 0; i
< count
; ++i
)
5161 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5162 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5164 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5174 if (second_vec
.first
== -1U)
5175 second_vec
= first_vec
;
5177 /* Generate the permute statement if necessary. */
5178 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5180 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5182 tree perm_dest
= make_ssa_name (vectype
);
5185 slp_tree second_node
5186 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5188 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5189 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5190 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5191 first_def
, second_def
,
5195 /* We need a copy here in case the def was external. */
5196 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5197 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5198 /* Store the vector statement in NODE. */
5199 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
5203 first_vec
= std::make_pair (-1U, -1U);
5204 second_vec
= std::make_pair (-1U, -1U);
5209 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
5214 /* Vectorize SLP NODE. */
5217 vect_schedule_slp_node (vec_info
*vinfo
,
5218 slp_tree node
, slp_instance instance
)
5220 gimple_stmt_iterator si
;
5224 /* For existing vectors there's nothing to do. */
5225 if (SLP_TREE_VEC_DEFS (node
).exists ())
5228 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
5230 /* Vectorize externals and constants. */
5231 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
5232 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
5234 /* ??? vectorizable_shift can end up using a scalar operand which is
5235 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5236 node in this case. */
5237 if (!SLP_TREE_VECTYPE (node
))
5240 vect_create_constant_vectors (vinfo
, node
);
5244 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
5246 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
5247 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
5249 if (dump_enabled_p ())
5250 dump_printf_loc (MSG_NOTE
, vect_location
,
5251 "------>vectorizing SLP node starting from: %G",
5254 if (STMT_VINFO_DATA_REF (stmt_info
)
5255 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5257 /* Vectorized loads go before the first scalar load to make it
5258 ready early, vectorized stores go before the last scalar
5259 stmt which is where all uses are ready. */
5260 stmt_vec_info last_stmt_info
= NULL
;
5261 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
5262 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
5263 else /* DR_IS_WRITE */
5264 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
5265 si
= gsi_for_stmt (last_stmt_info
->stmt
);
5267 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
5268 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
5269 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
5270 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5272 /* For PHI node vectorization we do not use the insertion iterator. */
5277 /* Emit other stmts after the children vectorized defs which is
5278 earliest possible. */
5279 gimple
*last_stmt
= NULL
;
5280 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5281 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
5283 /* For fold-left reductions we are retaining the scalar
5284 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5285 set so the representation isn't perfect. Resort to the
5286 last scalar def here. */
5287 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
5289 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
5290 == cycle_phi_info_type
);
5291 gphi
*phi
= as_a
<gphi
*>
5292 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
5294 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
5297 /* We are emitting all vectorized stmts in the same place and
5298 the last one is the last.
5299 ??? Unless we have a load permutation applied and that
5300 figures to re-use an earlier generated load. */
5303 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
5305 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5308 else if (!SLP_TREE_VECTYPE (child
))
5310 /* For externals we use unvectorized at all scalar defs. */
5313 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
5314 if (TREE_CODE (def
) == SSA_NAME
5315 && !SSA_NAME_IS_DEFAULT_DEF (def
))
5317 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
5319 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
5325 /* For externals we have to look at all defs since their
5326 insertion place is decided per vector. But beware
5327 of pre-existing vectors where we need to make sure
5328 we do not insert before the region boundary. */
5329 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
5330 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
5331 last_stmt
= gsi_stmt (gsi_after_labels
5332 (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]));
5337 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
5338 if (TREE_CODE (vdef
) == SSA_NAME
5339 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
5341 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
5343 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5348 /* This can happen when all children are pre-existing vectors or
5351 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
5352 if (is_a
<gphi
*> (last_stmt
))
5353 si
= gsi_after_labels (gimple_bb (last_stmt
));
5356 si
= gsi_for_stmt (last_stmt
);
5361 bool done_p
= false;
5363 /* Handle purely internal nodes. */
5364 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
5366 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
5367 be shared with different SLP nodes (but usually it's the same
5368 operation apart from the case the stmt is only there for denoting
5369 the actual scalar lane defs ...). So do not call vect_transform_stmt
5370 but open-code it here (partly). */
5371 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
5376 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
5379 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5380 For loop vectorization this is done in vectorizable_call, but for SLP
5381 it needs to be deferred until end of vect_schedule_slp, because multiple
5382 SLP instances may refer to the same scalar stmt. */
5385 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
5386 slp_tree node
, hash_set
<slp_tree
> &visited
)
5389 gimple_stmt_iterator gsi
;
5393 stmt_vec_info stmt_info
;
5395 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5398 if (visited
.add (node
))
5401 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5402 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
5404 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
5406 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
5407 if (!stmt
|| gimple_bb (stmt
) == NULL
)
5409 if (is_pattern_stmt_p (stmt_info
)
5410 || !PURE_SLP_STMT (stmt_info
))
5412 lhs
= gimple_call_lhs (stmt
);
5413 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
5414 gsi
= gsi_for_stmt (stmt
);
5415 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
5416 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
5421 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
5423 hash_set
<slp_tree
> visited
;
5424 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
5427 /* Vectorize the instance root. */
5430 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
5432 gassign
*rstmt
= NULL
;
5434 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
5439 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5441 tree vect_lhs
= gimple_get_lhs (child_stmt
);
5442 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5443 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
5444 TREE_TYPE (vect_lhs
)))
5445 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
5447 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
5451 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
5453 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5456 vec
<constructor_elt
, va_gc
> *v
;
5457 vec_alloc (v
, nelts
);
5459 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5461 CONSTRUCTOR_APPEND_ELT (v
,
5463 gimple_get_lhs (child_stmt
));
5465 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5466 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
5467 tree r_constructor
= build_constructor (rtype
, v
);
5468 rstmt
= gimple_build_assign (lhs
, r_constructor
);
5473 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
5474 gsi_replace (&rgsi
, rstmt
, true);
5484 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
5487 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
5488 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
5489 int &maxdfs
, vec
<slp_tree
> &stack
)
5492 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
5493 gcc_assert (!existed_p
);
5495 info
->lowlink
= maxdfs
;
5496 info
->on_stack
= true;
5498 stack
.safe_push (node
);
5502 /* ??? We're keeping SLP_TREE_CHILDREN of externalized nodes. */
5503 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
5504 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5508 slp_scc_info
*child_info
= scc_info
.get (child
);
5511 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
5512 /* Recursion might have re-allocated the node. */
5513 info
= scc_info
.get (node
);
5514 child_info
= scc_info
.get (child
);
5515 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
5517 else if (child_info
->on_stack
)
5518 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
5520 if (info
->lowlink
!= info
->dfs
)
5524 if (stack
.last () == node
)
5527 info
->on_stack
= false;
5528 vect_schedule_slp_node (vinfo
, node
, instance
);
5532 int last_idx
= stack
.length () - 1;
5533 while (stack
[last_idx
] != node
)
5535 /* We can break the cycle at PHIs who have at least one child
5536 code generated. Then we could re-start the DFS walk until
5537 all nodes in the SCC are covered (we might have new entries
5538 for only back-reachable nodes). But it's simpler to just
5539 iterate and schedule those that are ready. */
5540 auto_vec
<slp_tree
, 4> phis_to_fixup
;
5541 unsigned todo
= stack
.length () - last_idx
;
5544 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
5546 slp_tree entry
= stack
[idx
];
5549 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
5550 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
5552 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
5559 else if (scc_info
.get (child
)->on_stack
)
5577 vect_schedule_slp_node (vinfo
, entry
, instance
);
5578 scc_info
.get (entry
)->on_stack
= false;
5582 phis_to_fixup
.safe_push (entry
);
5588 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
5590 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
5592 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
5595 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
5597 unsigned dest_idx
= e
->dest_idx
;
5598 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
5599 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
5601 /* Simply fill all args. */
5602 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
5603 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
5604 vect_get_slp_vect_def (child
, i
),
5605 e
, gimple_phi_arg_location (phi
, dest_idx
));
5610 stack
.truncate (last_idx
);
5613 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5616 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
5618 slp_instance instance
;
5621 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
5623 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5625 slp_tree node
= SLP_INSTANCE_TREE (instance
);
5626 if (dump_enabled_p ())
5628 dump_printf_loc (MSG_NOTE
, vect_location
,
5629 "Vectorizing SLP tree:\n");
5630 if (SLP_INSTANCE_ROOT_STMT (instance
))
5631 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
5632 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
5633 vect_print_slp_graph (MSG_NOTE
, vect_location
,
5634 SLP_INSTANCE_TREE (instance
));
5636 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
5637 have a PHI be the node breaking the cycle. */
5638 auto_vec
<slp_tree
> stack
;
5639 if (!scc_info
.get (node
))
5640 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
5642 if (SLP_INSTANCE_ROOT_STMT (instance
))
5643 vectorize_slp_instance_root_stmt (node
, instance
);
5645 if (dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE
, vect_location
,
5647 "vectorizing stmts using SLP.\n");
5650 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5652 slp_tree root
= SLP_INSTANCE_TREE (instance
);
5653 stmt_vec_info store_info
;
5656 /* Remove scalar call stmts. Do not do this for basic-block
5657 vectorization as not all uses may be vectorized.
5658 ??? Why should this be necessary? DCE should be able to
5659 remove the stmts itself.
5660 ??? For BB vectorization we can as well remove scalar
5661 stmts starting from the SLP tree root if they have no
5663 if (is_a
<loop_vec_info
> (vinfo
))
5664 vect_remove_slp_scalar_calls (vinfo
, root
);
5666 /* Remove vectorized stores original scalar stmts. */
5667 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
5669 if (!STMT_VINFO_DATA_REF (store_info
)
5670 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
5673 store_info
= vect_orig_stmt (store_info
);
5674 /* Free the attached stmt_vec_info and remove the stmt. */
5675 vinfo
->remove_stmt (store_info
);