1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
52 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
53 slp_tree
, stmt_vector_for_cost
*);
55 object_allocator
<_slp_tree
> *slp_tree_pool
;
58 _slp_tree::operator new (size_t n
)
60 gcc_assert (n
== sizeof (_slp_tree
));
61 return slp_tree_pool
->allocate_raw ();
65 _slp_tree::operator delete (void *node
, size_t n
)
67 gcc_assert (n
== sizeof (_slp_tree
));
68 slp_tree_pool
->remove_raw (node
);
72 /* Initialize a SLP node. */
74 _slp_tree::_slp_tree ()
76 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
77 SLP_TREE_SCALAR_OPS (this) = vNULL
;
78 SLP_TREE_VEC_STMTS (this) = vNULL
;
79 SLP_TREE_VEC_DEFS (this) = vNULL
;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL
;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
85 SLP_TREE_CODE (this) = ERROR_MARK
;
86 SLP_TREE_VECTYPE (this) = NULL_TREE
;
87 SLP_TREE_REPRESENTATIVE (this) = NULL
;
88 SLP_TREE_REF_COUNT (this) = 1;
93 /* Tear down a SLP node. */
95 _slp_tree::~_slp_tree ()
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
109 vect_free_slp_tree (slp_tree node
)
114 if (--SLP_TREE_REF_COUNT (node
) != 0)
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
119 vect_free_slp_tree (child
);
124 /* Return a location suitable for dumpings related to the SLP instance. */
127 _slp_instance::location () const
130 return root_stmt
->stmt
;
132 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
136 /* Free the memory allocated for the SLP instance. */
139 vect_free_slp_instance (slp_instance instance
)
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
142 SLP_INSTANCE_LOADS (instance
).release ();
143 instance
->subgraph_entries
.release ();
144 instance
->cost_vec
.release ();
149 /* Create an SLP node for SCALAR_STMTS. */
152 vect_create_new_slp_node (slp_tree node
,
153 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
155 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
156 SLP_TREE_CHILDREN (node
).create (nops
);
157 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
158 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
159 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
163 /* Create an SLP node for SCALAR_STMTS. */
166 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
168 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
171 /* Create an SLP node for OPS. */
174 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
176 SLP_TREE_SCALAR_OPS (node
) = ops
;
177 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
178 SLP_TREE_LANES (node
) = ops
.length ();
182 /* Create an SLP node for OPS. */
185 vect_create_new_slp_node (vec
<tree
> ops
)
187 return vect_create_new_slp_node (new _slp_tree
, ops
);
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
194 typedef struct _slp_oprnd_info
196 /* Def-stmts for the operands. */
197 vec
<stmt_vec_info
> def_stmts
;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
204 enum vect_def_type first_dt
;
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
211 static vec
<slp_oprnd_info
>
212 vect_create_oprnd_info (int nops
, int group_size
)
215 slp_oprnd_info oprnd_info
;
216 vec
<slp_oprnd_info
> oprnds_info
;
218 oprnds_info
.create (nops
);
219 for (i
= 0; i
< nops
; i
++)
221 oprnd_info
= XNEW (struct _slp_oprnd_info
);
222 oprnd_info
->def_stmts
.create (group_size
);
223 oprnd_info
->ops
.create (group_size
);
224 oprnd_info
->first_dt
= vect_uninitialized_def
;
225 oprnd_info
->first_op_type
= NULL_TREE
;
226 oprnd_info
->any_pattern
= false;
227 oprnds_info
.quick_push (oprnd_info
);
234 /* Free operands info. */
237 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
240 slp_oprnd_info oprnd_info
;
242 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
244 oprnd_info
->def_stmts
.release ();
245 oprnd_info
->ops
.release ();
246 XDELETE (oprnd_info
);
249 oprnds_info
.release ();
253 /* Return true if STMTS contains a pattern statement. */
256 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
258 stmt_vec_info stmt_info
;
260 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
261 if (is_pattern_stmt_p (stmt_info
))
266 /* Return true when all lanes in the external or constant NODE have
270 vect_slp_tree_uniform_p (slp_tree node
)
272 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
275 /* Pre-exsting vectors. */
276 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
280 tree op
, first
= NULL_TREE
;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
284 else if (!operand_equal_p (first
, op
, 0))
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
296 stmt_vec_info first_stmt_info
)
298 stmt_vec_info next_stmt_info
= first_stmt_info
;
301 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
306 if (next_stmt_info
== stmt_info
)
308 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
310 result
+= DR_GROUP_GAP (next_stmt_info
);
312 while (next_stmt_info
);
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
324 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
325 tree elt_type
, unsigned int *nvectors_out
,
326 tree
*vector_type_out
,
329 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
330 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
333 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
334 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
335 unsigned int nvectors
= 1;
338 scalar_int_mode int_mode
;
339 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
340 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type
= build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode
), 1);
346 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
348 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
350 GET_MODE_SIZE (base_vector_mode
)))
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
356 vec_perm_builder
sel1 (nelts
, 2, 3);
357 vec_perm_builder
sel2 (nelts
, 2, 3);
358 poly_int64 half_nelts
= exact_div (nelts
, 2);
359 for (unsigned int i
= 0; i
< 3; ++i
)
362 sel1
.quick_push (i
+ nelts
);
363 sel2
.quick_push (half_nelts
+ i
);
364 sel2
.quick_push (half_nelts
+ i
+ nelts
);
366 vec_perm_indices
indices1 (sel1
, 2, nelts
);
367 vec_perm_indices
indices2 (sel2
, 2, nelts
);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
372 *nvectors_out
= nvectors
;
374 *vector_type_out
= vector_type
;
377 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
379 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
386 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
392 /* Return true if DTA and DTB match. */
395 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
398 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
399 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
402 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
403 they are of a valid type and that they match the defs of the first stmt of
404 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
405 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
406 indicates swap is required for cond_expr stmts. Specifically, *SWAP
407 is 1 if STMT is cond and operands of comparison need to be swapped;
408 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
409 If there is any operand swap in this function, *SWAP is set to non-zero
411 If there was a fatal error return -1; if the error could be corrected by
412 swapping operands of father node of this one, return 1; if everything is
415 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
417 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
418 vec
<slp_oprnd_info
> *oprnds_info
)
420 stmt_vec_info stmt_info
= stmts
[stmt_num
];
422 unsigned int i
, number_of_oprnds
;
423 enum vect_def_type dt
= vect_uninitialized_def
;
424 slp_oprnd_info oprnd_info
;
425 int first_op_idx
= 1;
426 unsigned int commutative_op
= -1U;
427 bool first_op_cond
= false;
428 bool first
= stmt_num
== 0;
430 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
432 number_of_oprnds
= gimple_call_num_args (stmt
);
434 if (gimple_call_internal_p (stmt
))
436 internal_fn ifn
= gimple_call_internal_fn (stmt
);
437 commutative_op
= first_commutative_argument (ifn
);
439 /* Masked load, only look at mask. */
440 if (ifn
== IFN_MASK_LOAD
)
442 number_of_oprnds
= 1;
443 /* Mask operand index. */
448 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
450 enum tree_code code
= gimple_assign_rhs_code (stmt
);
451 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
452 /* Swap can only be done for cond_expr if asked to, otherwise we
453 could result in different comparison code to the first stmt. */
454 if (code
== COND_EXPR
455 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
457 first_op_cond
= true;
461 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
463 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
464 number_of_oprnds
= gimple_phi_num_args (stmt
);
468 bool swapped
= (swap
!= 0);
469 bool backedge
= false;
470 gcc_assert (!swapped
|| first_op_cond
);
471 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
472 for (i
= 0; i
< number_of_oprnds
; i
++)
476 /* Map indicating how operands of cond_expr should be swapped. */
477 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
478 int *map
= maps
[swap
];
481 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
482 first_op_idx
), map
[i
]);
484 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
486 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
488 oprnd
= gimple_phi_arg_def (stmt
, i
);
489 backedge
= dominated_by_p (CDI_DOMINATORS
,
490 gimple_phi_arg_edge (stmt
, i
)->src
,
491 gimple_bb (stmt_info
->stmt
));
494 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
495 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
496 oprnd
= TREE_OPERAND (oprnd
, 0);
498 oprnd_info
= (*oprnds_info
)[i
];
500 stmt_vec_info def_stmt_info
;
501 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
505 "Build SLP failed: can't analyze def for %T\n",
513 oprnd_info
->def_stmts
.quick_push (NULL
);
514 oprnd_info
->ops
.quick_push (NULL_TREE
);
515 oprnd_info
->first_dt
= vect_uninitialized_def
;
519 if (def_stmt_info
&& is_pattern_stmt_p (def_stmt_info
))
520 oprnd_info
->any_pattern
= true;
522 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
523 oprnd_info
->ops
.quick_push (oprnd
);
525 /* If there's a extern def on a backedge make sure we can
526 code-generate at the region start.
527 ??? This is another case that could be fixed by adjusting
528 how we split the function but at the moment we'd have conflicting
531 && dts
[i
] == vect_external_def
532 && is_a
<bb_vec_info
> (vinfo
)
533 && TREE_CODE (oprnd
) == SSA_NAME
534 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
535 && !dominated_by_p (CDI_DOMINATORS
,
536 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
537 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
541 "Build SLP failed: extern def %T only defined "
542 "on backedge\n", oprnd
);
548 tree type
= TREE_TYPE (oprnd
);
550 if ((dt
== vect_constant_def
551 || dt
== vect_external_def
)
552 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
553 && (TREE_CODE (type
) == BOOLEAN_TYPE
554 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
559 "Build SLP failed: invalid type of def "
560 "for variable-length SLP %T\n", oprnd
);
564 /* For the swapping logic below force vect_reduction_def
565 for the reduction op in a SLP reduction group. */
566 if (!STMT_VINFO_DATA_REF (stmt_info
)
567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
568 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
570 dts
[i
] = dt
= vect_reduction_def
;
572 /* Check the types of the definition. */
575 case vect_external_def
:
576 case vect_constant_def
:
577 case vect_internal_def
:
578 case vect_reduction_def
:
579 case vect_induction_def
:
580 case vect_nested_cycle
:
584 /* FORNOW: Not supported. */
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
587 "Build SLP failed: illegal type of def %T\n",
592 oprnd_info
->first_dt
= dt
;
593 oprnd_info
->first_op_type
= type
;
599 /* Now match the operand definition types to that of the first stmt. */
600 for (i
= 0; i
< number_of_oprnds
;)
608 oprnd_info
= (*oprnds_info
)[i
];
610 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
611 oprnd
= oprnd_info
->ops
[stmt_num
];
612 tree type
= TREE_TYPE (oprnd
);
614 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
618 "Build SLP failed: different operand types\n");
622 /* Not first stmt of the group, check that the def-stmt/s match
623 the def-stmt/s of the first stmt. Allow different definition
624 types for reduction chains: the first stmt must be a
625 vect_reduction_def (a phi node), and the rest
626 end in the reduction chain. */
627 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
628 && !(oprnd_info
->first_dt
== vect_reduction_def
629 && !STMT_VINFO_DATA_REF (stmt_info
)
630 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
632 && !STMT_VINFO_DATA_REF (def_stmt_info
)
633 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
634 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
635 || (!STMT_VINFO_DATA_REF (stmt_info
)
636 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
638 || STMT_VINFO_DATA_REF (def_stmt_info
)
639 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
640 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
641 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
643 /* Try swapping operands if we got a mismatch. For BB
644 vectorization only in case it will clearly improve things. */
645 if (i
== commutative_op
&& !swapped
646 && (!is_a
<bb_vec_info
> (vinfo
)
647 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
649 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
650 || vect_def_types_match
651 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE
, vect_location
,
655 "trying swapped operands\n");
656 std::swap (dts
[i
], dts
[i
+1]);
657 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
658 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
659 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
660 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
665 if (is_a
<bb_vec_info
> (vinfo
)
666 && !oprnd_info
->any_pattern
)
668 /* Now for commutative ops we should see whether we can
669 make the other operand matching. */
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
672 "treating operand as external\n");
673 oprnd_info
->first_dt
= dt
= vect_external_def
;
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
679 "Build SLP failed: different types\n");
684 /* Make sure to demote the overall operand to external. */
685 if (dt
== vect_external_def
)
686 oprnd_info
->first_dt
= vect_external_def
;
687 /* For a SLP reduction chain we want to duplicate the reduction to
688 each of the chain members. That gets us a sane SLP graph (still
689 the stmts are not 100% correct wrt the initial values). */
690 else if ((dt
== vect_internal_def
691 || dt
== vect_reduction_def
)
692 && oprnd_info
->first_dt
== vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info
)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
695 && !STMT_VINFO_DATA_REF (def_stmt_info
)
696 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
697 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
699 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
700 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
709 if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE
, vect_location
,
711 "swapped operands to match def types in %G",
718 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
719 Return true if we can, meaning that this choice doesn't conflict with
720 existing SLP nodes that use STMT_INFO. */
723 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
725 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
727 return useless_type_conversion_p (vectype
, old_vectype
);
729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
731 /* We maintain the invariant that if any statement in the group is
732 used, all other members of the group have the same vector type. */
733 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
734 stmt_vec_info member_info
= first_info
;
735 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
736 if (is_pattern_stmt_p (member_info
)
737 && !useless_type_conversion_p (vectype
,
738 STMT_VINFO_VECTYPE (member_info
)))
743 for (member_info
= first_info
; member_info
;
744 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
745 STMT_VINFO_VECTYPE (member_info
) = vectype
;
749 else if (!is_pattern_stmt_p (stmt_info
))
751 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
755 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
758 "Build SLP failed: incompatible vector"
759 " types for: %G", stmt_info
->stmt
);
760 dump_printf_loc (MSG_NOTE
, vect_location
,
761 " old vector type: %T\n", old_vectype
);
762 dump_printf_loc (MSG_NOTE
, vect_location
,
763 " new vector type: %T\n", vectype
);
768 /* Return true if call statements CALL1 and CALL2 are similar enough
769 to be combined into the same SLP group. */
772 compatible_calls_p (gcall
*call1
, gcall
*call2
)
774 unsigned int nargs
= gimple_call_num_args (call1
);
775 if (nargs
!= gimple_call_num_args (call2
))
778 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
781 if (gimple_call_internal_p (call1
))
783 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
784 TREE_TYPE (gimple_call_lhs (call2
))))
786 for (unsigned int i
= 0; i
< nargs
; ++i
)
787 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
788 TREE_TYPE (gimple_call_arg (call2
, i
))))
793 if (!operand_equal_p (gimple_call_fn (call1
),
794 gimple_call_fn (call2
), 0))
797 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
803 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
804 caller's attempt to find the vector type in STMT_INFO with the narrowest
805 element type. Return true if VECTYPE is nonnull and if it is valid
806 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
807 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
808 vect_build_slp_tree. */
811 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
812 unsigned int group_size
,
813 tree vectype
, poly_uint64
*max_nunits
)
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
819 "Build SLP failed: unsupported data-type in %G\n",
821 /* Fatal mismatch. */
825 /* If populating the vector type requires unrolling then fail
826 before adjusting *max_nunits for basic-block vectorization. */
827 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
828 unsigned HOST_WIDE_INT const_nunits
;
829 if (is_a
<bb_vec_info
> (vinfo
)
830 && (!nunits
.is_constant (&const_nunits
)
831 || const_nunits
> group_size
))
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
835 "Build SLP failed: unrolling required "
836 "in basic block SLP\n");
837 /* Fatal mismatch. */
841 /* In case of multiple types we need to detect the smallest type. */
842 vect_update_max_nunits (max_nunits
, vectype
);
846 /* Verify if the scalar stmts STMTS are isomorphic, require data
847 permutation or are of unsupported types of operation. Return
848 true if they are, otherwise return false and indicate in *MATCHES
849 which stmts are not isomorphic to the first one. If MATCHES[0]
850 is false then this indicates the comparison could not be
851 carried out or the stmts will never be vectorized by SLP.
853 Note COND_EXPR is possibly isomorphic to another one after swapping its
854 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
855 the first stmt by swapping the two operands of comparison; set SWAP[i]
856 to 2 if stmt I is isormorphic to the first stmt by inverting the code
857 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
858 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
861 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
862 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
863 poly_uint64
*max_nunits
, bool *matches
,
864 bool *two_operators
, tree
*node_vectype
)
867 stmt_vec_info first_stmt_info
= stmts
[0];
868 enum tree_code first_stmt_code
= ERROR_MARK
;
869 enum tree_code alt_stmt_code
= ERROR_MARK
;
870 enum tree_code rhs_code
= ERROR_MARK
;
871 enum tree_code first_cond_code
= ERROR_MARK
;
873 bool need_same_oprnds
= false;
874 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
877 machine_mode optab_op2_mode
;
878 machine_mode vec_mode
;
879 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
880 bool first_stmt_load_p
= false, load_p
= false;
881 bool first_stmt_phi_p
= false, phi_p
= false;
883 /* For every stmt in NODE find its def stmt/s. */
884 stmt_vec_info stmt_info
;
885 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
887 gimple
*stmt
= stmt_info
->stmt
;
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
894 /* Fail to vectorize statements marked as unvectorizable, throw
896 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
897 || stmt_can_throw_internal (cfun
, stmt
)
898 || gimple_has_volatile_ops (stmt
))
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
902 "Build SLP failed: unvectorizable statement %G",
904 /* ??? For BB vectorization we want to commutate operands in a way
905 to shuffle all unvectorizable defs into one operand and have
906 the other still vectorized. The following doesn't reliably
907 work for this though but it's the easiest we can do here. */
908 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
910 /* Fatal mismatch. */
915 lhs
= gimple_get_lhs (stmt
);
916 if (lhs
== NULL_TREE
)
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
920 "Build SLP failed: not GIMPLE_ASSIGN nor "
921 "GIMPLE_CALL %G", stmt
);
922 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
924 /* Fatal mismatch. */
930 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
931 &nunits_vectype
, group_size
)
933 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
934 nunits_vectype
, max_nunits
)))
936 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
938 /* Fatal mismatch. */
943 gcc_assert (vectype
);
945 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
948 rhs_code
= CALL_EXPR
;
950 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
952 else if ((gimple_call_internal_p (call_stmt
)
953 && (!vectorizable_internal_fn_p
954 (gimple_call_internal_fn (call_stmt
))))
955 || gimple_call_tail_p (call_stmt
)
956 || gimple_call_noreturn_p (call_stmt
)
957 || !gimple_call_nothrow_p (call_stmt
)
958 || gimple_call_chain (call_stmt
))
960 if (dump_enabled_p ())
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
962 "Build SLP failed: unsupported call type %G",
964 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
966 /* Fatal mismatch. */
971 else if (gimple_code (stmt
) == GIMPLE_PHI
)
973 rhs_code
= ERROR_MARK
;
978 rhs_code
= gimple_assign_rhs_code (stmt
);
979 load_p
= gimple_vuse (stmt
);
982 /* Check the operation. */
985 *node_vectype
= vectype
;
986 first_stmt_code
= rhs_code
;
987 first_stmt_load_p
= load_p
;
988 first_stmt_phi_p
= phi_p
;
990 /* Shift arguments should be equal in all the packed stmts for a
991 vector shift with scalar shift operand. */
992 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
993 || rhs_code
== LROTATE_EXPR
994 || rhs_code
== RROTATE_EXPR
)
996 vec_mode
= TYPE_MODE (vectype
);
998 /* First see if we have a vector/vector shift. */
999 optab
= optab_for_tree_code (rhs_code
, vectype
,
1003 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1005 /* No vector/vector shift, try for a vector/scalar shift. */
1006 optab
= optab_for_tree_code (rhs_code
, vectype
,
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1013 "Build SLP failed: no optab.\n");
1014 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1016 /* Fatal mismatch. */
1020 icode
= (int) optab_handler (optab
, vec_mode
);
1021 if (icode
== CODE_FOR_nothing
)
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1025 "Build SLP failed: "
1026 "op not supported by target.\n");
1027 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1029 /* Fatal mismatch. */
1033 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1034 if (!VECTOR_MODE_P (optab_op2_mode
))
1036 need_same_oprnds
= true;
1037 first_op1
= gimple_assign_rhs2 (stmt
);
1041 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1043 need_same_oprnds
= true;
1044 first_op1
= gimple_assign_rhs2 (stmt
);
1047 && rhs_code
== BIT_FIELD_REF
)
1049 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1050 if (TREE_CODE (vec
) != SSA_NAME
1051 || !types_compatible_p (vectype
, TREE_TYPE (vec
)))
1053 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1057 "Build SLP failed: "
1058 "BIT_FIELD_REF not supported\n");
1059 /* Fatal mismatch. */
1065 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1067 need_same_oprnds
= true;
1068 first_op1
= gimple_call_arg (call_stmt
, 1);
1073 if (first_stmt_code
!= rhs_code
1074 && alt_stmt_code
== ERROR_MARK
)
1075 alt_stmt_code
= rhs_code
;
1076 if ((first_stmt_code
!= rhs_code
1077 && (first_stmt_code
!= IMAGPART_EXPR
1078 || rhs_code
!= REALPART_EXPR
)
1079 && (first_stmt_code
!= REALPART_EXPR
1080 || rhs_code
!= IMAGPART_EXPR
)
1081 /* Handle mismatches in plus/minus by computing both
1082 and merging the results. */
1083 && !((first_stmt_code
== PLUS_EXPR
1084 || first_stmt_code
== MINUS_EXPR
)
1085 && (alt_stmt_code
== PLUS_EXPR
1086 || alt_stmt_code
== MINUS_EXPR
)
1087 && rhs_code
== alt_stmt_code
)
1088 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1089 && (first_stmt_code
== ARRAY_REF
1090 || first_stmt_code
== BIT_FIELD_REF
1091 || first_stmt_code
== INDIRECT_REF
1092 || first_stmt_code
== COMPONENT_REF
1093 || first_stmt_code
== MEM_REF
)))
1094 || first_stmt_load_p
!= load_p
1095 || first_stmt_phi_p
!= phi_p
)
1097 if (dump_enabled_p ())
1099 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1100 "Build SLP failed: different operation "
1101 "in stmt %G", stmt
);
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1103 "original stmt %G", first_stmt_info
->stmt
);
1109 if (need_same_oprnds
)
1111 tree other_op1
= (call_stmt
1112 ? gimple_call_arg (call_stmt
, 1)
1113 : gimple_assign_rhs2 (stmt
));
1114 if (!operand_equal_p (first_op1
, other_op1
, 0))
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1118 "Build SLP failed: different shift "
1119 "arguments in %G", stmt
);
1125 && first_stmt_code
== BIT_FIELD_REF
1126 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1127 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1131 "Build SLP failed: different BIT_FIELD_REF "
1132 "arguments in %G", stmt
);
1137 if (!load_p
&& rhs_code
== CALL_EXPR
)
1139 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1140 as_a
<gcall
*> (stmt
)))
1142 if (dump_enabled_p ())
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1144 "Build SLP failed: different calls in %G",
1152 && (gimple_bb (first_stmt_info
->stmt
)
1153 != gimple_bb (stmt_info
->stmt
)))
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1157 "Build SLP failed: different BB for PHI "
1163 if (!types_compatible_p (vectype
, *node_vectype
))
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1167 "Build SLP failed: different vector type "
1174 /* Grouped store or load. */
1175 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1177 if (REFERENCE_CLASS_P (lhs
))
1185 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1186 if (prev_first_load
)
1188 /* Check that there are no loads from different interleaving
1189 chains in the same node. */
1190 if (prev_first_load
!= first_load
)
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1195 "Build SLP failed: different "
1196 "interleaving chains in one node %G",
1203 prev_first_load
= first_load
;
1205 } /* Grouped access. */
1210 /* Not grouped load. */
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1213 "Build SLP failed: not grouped load %G", stmt
);
1215 /* FORNOW: Not grouped loads are not supported. */
1216 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1218 /* Fatal mismatch. */
1223 /* Not memory operation. */
1225 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1226 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1227 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1228 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1229 && rhs_code
!= VIEW_CONVERT_EXPR
1230 && rhs_code
!= CALL_EXPR
1231 && rhs_code
!= BIT_FIELD_REF
)
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1235 "Build SLP failed: operation unsupported %G",
1237 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1239 /* Fatal mismatch. */
1244 if (rhs_code
== COND_EXPR
)
1246 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1247 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1248 enum tree_code swap_code
= ERROR_MARK
;
1249 enum tree_code invert_code
= ERROR_MARK
;
1252 first_cond_code
= TREE_CODE (cond_expr
);
1253 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1255 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1256 swap_code
= swap_tree_comparison (cond_code
);
1257 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1260 if (first_cond_code
== cond_code
)
1262 /* Isomorphic can be achieved by swapping. */
1263 else if (first_cond_code
== swap_code
)
1265 /* Isomorphic can be achieved by inverting. */
1266 else if (first_cond_code
== invert_code
)
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1272 "Build SLP failed: different"
1273 " operation %G", stmt
);
1283 for (i
= 0; i
< group_size
; ++i
)
1287 /* If we allowed a two-operation SLP node verify the target can cope
1288 with the permute we are going to use. */
1289 if (alt_stmt_code
!= ERROR_MARK
1290 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1292 *two_operators
= true;
1298 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1299 Note we never remove apart from at destruction time so we do not
1300 need a special value for deleted that differs from empty. */
/* Key/compare type: the vector of scalar stmts that keys an SLP (sub-)tree.
   NOTE(review): the extraction dropped the enclosing class/struct header
   line for these trait members; code text kept byte-identical.  */
1303 typedef vec
<stmt_vec_info
> value_type
;
1304 typedef vec
<stmt_vec_info
> compare_type
;
/* Hashing and equality are declared here and defined out of line below.  */
1305 static inline hashval_t
hash (value_type
);
1306 static inline bool equal (value_type existing
, value_type candidate
);
/* Per the comment above, empty and deleted share one representation:
   a vec that does not exist.  Zero-initialized storage is thus empty.  */
1307 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1308 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1309 static const bool empty_zero_p
= true;
/* Marking empty/deleted and removing all release the key vector's memory.  */
1310 static inline void mark_empty (value_type
&x
) { x
.release (); }
1311 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1312 static inline void remove (value_type
&x
) { x
.release (); }
/* Hash a scalar-stmt vector by folding the gimple UID of each statement
   into the hash.  NOTE(review): the extraction dropped some original
   lines here (return type, braces, the hash accumulator declaration and
   the return); code text kept byte-identical.  */
1315 bst_traits::hash (value_type x
)
1318 for (unsigned i
= 0; i
< x
.length (); ++i
)
1319 h
.add_int (gimple_uid (x
[i
]->stmt
));
/* Two stmt vectors compare equal when they have the same length and
   pairwise identical stmt_vec_info elements.  */
1323 bst_traits::equal (value_type existing
, value_type candidate
)
1325 if (existing
.length () != candidate
.length ())
1327 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1328 if (existing
[i
] != candidate
[i
])
/* Map from a vector of scalar stmts to the SLP tree discovered for it
   (a NULL slp_tree records a failed build), hashed via bst_traits.  */
1333 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1334 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1335 scalar_stmts_to_slp_tree_map_t
;
/* Forward declaration of the discovery worker; defined further below.  */
1338 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1339 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1340 poly_uint64
*max_nunits
,
1341 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1342 scalar_stmts_to_slp_tree_map_t
*bst_map
);
/* Build (or re-use) an SLP tree for the stmt group STMTS, caching both
   successful and failed discoveries in BST_MAP.  Wrapper around
   vect_build_slp_tree_2 handling cache lookup and reference counting.
   NOTE(review): this chunk is missing some original lines (return type,
   braces, early returns); the code text below is kept byte-identical.  */
1345 vect_build_slp_tree (vec_info
*vinfo
,
1346 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1347 poly_uint64
*max_nunits
,
1348 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1349 scalar_stmts_to_slp_tree_map_t
*bst_map
)
/* Re-use a previously discovered tree for this stmt set; a NULL entry
   records a previously failed build.  */
1351 if (slp_tree
*leader
= bst_map
->get (stmts
))
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1355 *leader
? "" : "failed ", *leader
);
/* Hand out another reference to the cached tree and propagate its
   max_nunits to the caller.  */
1358 SLP_TREE_REF_COUNT (*leader
)++;
1359 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1364 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1365 so we can pick up backedge destinations during discovery. */
1366 slp_tree res
= new _slp_tree
;
1367 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1368 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1369 bst_map
->put (stmts
.copy (), res
);
/* Perform the actual discovery on the stub node.  */
1371 poly_uint64 this_max_nunits
= 1;
1372 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1374 matches
, npermutes
, tree_size
, bst_map
);
/* Presumably the failure path (its guarding condition is among the
   dropped lines — TODO confirm against upstream): overwrite the stub's
   cache entry with NULL so the failure is remembered.  */
1377 bool existed_p
= bst_map
->put (stmts
, NULL
);
1378 gcc_assert (existed_p
);
1379 /* Mark the node invalid so we can detect those when still in use
1380 as backedge destinations. */
1381 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1382 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1383 vect_free_slp_tree (res
);
/* Success path: vect_build_slp_tree_2 filled in the stub node itself.  */
1387 gcc_assert (res_
== res
);
1388 res
->max_nunits
= this_max_nunits
;
1389 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1390 /* Keep a reference for the bst_map use. */
1391 SLP_TREE_REF_COUNT (res
)++;
1396 /* Recursively build an SLP tree starting from NODE.
1397 Fail (and return a value not equal to zero) if def-stmts are not
1398 isomorphic, require data permutation or are of unsupported types of
1399 operation. Otherwise, return 0.
1400 The value returned is the depth in the SLP tree where a mismatch
1404 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1405 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1406 poly_uint64
*max_nunits
,
1407 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1408 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1410 unsigned nops
, i
, this_tree_size
= 0;
1411 poly_uint64 this_max_nunits
= *max_nunits
;
1415 stmt_vec_info stmt_info
= stmts
[0];
1416 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1417 nops
= gimple_call_num_args (stmt
);
1418 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1420 nops
= gimple_num_ops (stmt
) - 1;
1421 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1424 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1425 nops
= gimple_phi_num_args (phi
);
1429 /* If the SLP node is a PHI (induction or reduction), terminate
1431 bool *skip_args
= XALLOCAVEC (bool, nops
);
1432 memset (skip_args
, 0, nops
);
1433 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1434 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1436 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1437 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1439 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1443 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1444 /* Induction from different IVs is not supported. */
1445 if (def_type
== vect_induction_def
)
1447 stmt_vec_info other_info
;
1448 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1449 if (stmt_info
!= other_info
)
1452 /* Induction PHIs are leafs. */
1454 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1455 SLP_TREE_VECTYPE (node
) = vectype
;
1456 SLP_TREE_CHILDREN (node
).quick_grow_cleared (nops
);
1459 else if (def_type
== vect_reduction_def
1460 || def_type
== vect_double_reduction_def
1461 || def_type
== vect_nested_cycle
)
1463 /* Else def types have to match. */
1464 stmt_vec_info other_info
;
1465 bool all_same
= true;
1466 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1468 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1470 if (other_info
!= stmt_info
)
1473 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1474 /* Reduction initial values are not explicitely represented. */
1475 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1476 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1477 /* Reduction chain backedge defs are filled manually.
1478 ??? Need a better way to identify a SLP reduction chain PHI.
1479 Or a better overall way to SLP match those. */
1480 if (all_same
&& def_type
== vect_reduction_def
)
1481 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1483 else if (def_type
!= vect_internal_def
)
1488 bool two_operators
= false;
1489 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1490 tree vectype
= NULL_TREE
;
1491 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1492 &this_max_nunits
, matches
, &two_operators
,
1496 /* If the SLP node is a load, terminate the recursion unless masked. */
1497 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1498 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1500 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1503 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1508 *max_nunits
= this_max_nunits
;
1510 node
= vect_create_new_slp_node (node
, stmts
, 0);
1511 SLP_TREE_VECTYPE (node
) = vectype
;
1512 /* And compute the load permutation. Whether it is actually
1513 a permutation depends on the unrolling factor which is
1515 vec
<unsigned> load_permutation
;
1517 stmt_vec_info load_info
;
1518 load_permutation
.create (group_size
);
1519 stmt_vec_info first_stmt_info
1520 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1521 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1523 int load_place
= vect_get_place_in_interleaving_chain
1524 (load_info
, first_stmt_info
);
1525 gcc_assert (load_place
!= -1);
1526 load_permutation
.safe_push (load_place
);
1528 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1532 else if (gimple_assign_single_p (stmt_info
->stmt
)
1533 && !gimple_vuse (stmt_info
->stmt
)
1534 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1536 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1537 the same SSA name vector of a compatible type to vectype. */
1538 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1539 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1540 stmt_vec_info estmt_info
;
1541 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1543 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1544 tree bfref
= gimple_assign_rhs1 (estmt
);
1546 if (!known_eq (bit_field_size (bfref
),
1547 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1548 || !constant_multiple_p (bit_field_offset (bfref
),
1549 bit_field_size (bfref
), &lane
))
1554 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1556 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1557 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1558 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1559 /* We are always building a permutation node even if it is an identity
1560 permute to shield the rest of the vectorizer from the odd node
1561 representing an actual vector without any scalar ops.
1562 ??? We could hide it completely with making the permute node
1564 node
= vect_create_new_slp_node (node
, stmts
, 1);
1565 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1566 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1567 SLP_TREE_VECTYPE (node
) = vectype
;
1568 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1572 /* Get at the operands, verifying they are compatible. */
1573 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1574 slp_oprnd_info oprnd_info
;
1575 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1577 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1578 stmts
, i
, &oprnds_info
);
1580 matches
[(res
== -1) ? 0 : i
] = false;
1584 for (i
= 0; i
< group_size
; ++i
)
1587 vect_free_oprnd_info (oprnds_info
);
1592 auto_vec
<slp_tree
, 4> children
;
1594 stmt_info
= stmts
[0];
1596 /* Create SLP_TREE nodes for the definition node/s. */
1597 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1602 /* We're skipping certain operands from processing, for example
1603 outer loop reduction initial defs. */
1606 children
.safe_push (NULL
);
1610 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1612 /* COND_EXPR have one too many eventually if the condition
1614 gcc_assert (i
== 3 && nops
== 4);
1618 if (is_a
<bb_vec_info
> (vinfo
)
1619 && oprnd_info
->first_dt
== vect_internal_def
1620 && !oprnd_info
->any_pattern
)
1622 /* For BB vectorization, if all defs are the same do not
1623 bother to continue the build along the single-lane
1624 graph but use a splat of the scalar value. */
1625 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1626 for (j
= 1; j
< group_size
; ++j
)
1627 if (oprnd_info
->def_stmts
[j
] != first_def
)
1630 /* But avoid doing this for loads where we may be
1631 able to CSE things, unless the stmt is not
1633 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1634 || !gimple_vuse (first_def
->stmt
)))
1636 if (dump_enabled_p ())
1637 dump_printf_loc (MSG_NOTE
, vect_location
,
1638 "Using a splat of the uniform operand\n");
1639 oprnd_info
->first_dt
= vect_external_def
;
1643 if (oprnd_info
->first_dt
== vect_external_def
1644 || oprnd_info
->first_dt
== vect_constant_def
)
1646 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1647 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1648 oprnd_info
->ops
= vNULL
;
1649 children
.safe_push (invnode
);
1653 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1654 group_size
, &this_max_nunits
,
1656 &this_tree_size
, bst_map
)) != NULL
)
1658 oprnd_info
->def_stmts
= vNULL
;
1659 children
.safe_push (child
);
1663 /* If the SLP build for operand zero failed and operand zero
1664 and one can be commutated try that for the scalar stmts
1665 that failed the match. */
1667 /* A first scalar stmt mismatch signals a fatal mismatch. */
1669 /* ??? For COND_EXPRs we can swap the comparison operands
1670 as well as the arms under some constraints. */
1672 && oprnds_info
[1]->first_dt
== vect_internal_def
1673 && is_gimple_assign (stmt_info
->stmt
)
1674 /* Swapping operands for reductions breaks assumptions later on. */
1675 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1676 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
1677 /* Do so only if the number of not successful permutes was nor more
1678 than a cut-ff as re-trying the recursive match on
1679 possibly each level of the tree would expose exponential
1683 /* See whether we can swap the matching or the non-matching
1685 bool swap_not_matching
= true;
1688 for (j
= 0; j
< group_size
; ++j
)
1690 if (matches
[j
] != !swap_not_matching
)
1692 stmt_vec_info stmt_info
= stmts
[j
];
1693 /* Verify if we can swap operands of this stmt. */
1694 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1696 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1698 if (!swap_not_matching
)
1700 swap_not_matching
= false;
1705 while (j
!= group_size
);
1707 /* Swap mismatched definition stmts. */
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE
, vect_location
,
1710 "Re-trying with swapped operands of stmts ");
1711 for (j
= 0; j
< group_size
; ++j
)
1712 if (matches
[j
] == !swap_not_matching
)
1714 std::swap (oprnds_info
[0]->def_stmts
[j
],
1715 oprnds_info
[1]->def_stmts
[j
]);
1716 std::swap (oprnds_info
[0]->ops
[j
],
1717 oprnds_info
[1]->ops
[j
]);
1718 if (dump_enabled_p ())
1719 dump_printf (MSG_NOTE
, "%d ", j
);
1721 if (dump_enabled_p ())
1722 dump_printf (MSG_NOTE
, "\n");
1723 /* And try again with scratch 'matches' ... */
1724 bool *tem
= XALLOCAVEC (bool, group_size
);
1725 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1726 group_size
, &this_max_nunits
,
1728 &this_tree_size
, bst_map
)) != NULL
)
1730 oprnd_info
->def_stmts
= vNULL
;
1731 children
.safe_push (child
);
1734 /* We do not undo the swapping here since it might still be
1735 the better order for the second operand in case we build
1736 the first one from scalars below. */
1741 /* If the SLP build failed and we analyze a basic-block
1742 simply treat nodes we fail to build as externally defined
1743 (and thus build vectors from the scalar defs).
1744 The cost model will reject outright expensive cases.
1745 ??? This doesn't treat cases where permutation ultimatively
1746 fails (or we don't try permutation below). Ideally we'd
1747 even compute a permutation that will end up with the maximum
1749 if (is_a
<bb_vec_info
> (vinfo
)
1750 /* ??? Rejecting patterns this way doesn't work. We'd have to
1751 do extra work to cancel the pattern so the uses see the
1753 && !is_pattern_stmt_p (stmt_info
)
1754 && !oprnd_info
->any_pattern
)
1756 /* But if there's a leading vector sized set of matching stmts
1757 fail here so we can split the group. This matches the condition
1758 vect_analyze_slp_instance uses. */
1759 /* ??? We might want to split here and combine the results to support
1760 multiple vector sizes better. */
1761 for (j
= 0; j
< group_size
; ++j
)
1764 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1766 if (dump_enabled_p ())
1767 dump_printf_loc (MSG_NOTE
, vect_location
,
1768 "Building vector operands from scalars\n");
1770 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1771 children
.safe_push (child
);
1772 oprnd_info
->ops
= vNULL
;
1777 gcc_assert (child
== NULL
);
1778 FOR_EACH_VEC_ELT (children
, j
, child
)
1780 vect_free_slp_tree (child
);
1781 vect_free_oprnd_info (oprnds_info
);
1785 vect_free_oprnd_info (oprnds_info
);
1787 /* If we have all children of a child built up from uniform scalars
1788 or does more than one possibly expensive vector construction then
1789 just throw that away, causing it built up from scalars.
1790 The exception is the SLP node for the vector store. */
1791 if (is_a
<bb_vec_info
> (vinfo
)
1792 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1793 /* ??? Rejecting patterns this way doesn't work. We'd have to
1794 do extra work to cancel the pattern so the uses see the
1796 && !is_pattern_stmt_p (stmt_info
))
1800 bool all_uniform_p
= true;
1801 unsigned n_vector_builds
= 0;
1802 FOR_EACH_VEC_ELT (children
, j
, child
)
1806 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1807 all_uniform_p
= false;
1808 else if (!vect_slp_tree_uniform_p (child
))
1810 all_uniform_p
= false;
1811 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1815 if (all_uniform_p
|| n_vector_builds
> 1)
1819 FOR_EACH_VEC_ELT (children
, j
, child
)
1821 vect_free_slp_tree (child
);
1823 if (dump_enabled_p ())
1824 dump_printf_loc (MSG_NOTE
, vect_location
,
1825 "Building parent vector operands from "
1826 "scalars instead\n");
1831 *tree_size
+= this_tree_size
+ 1;
1832 *max_nunits
= this_max_nunits
;
1836 /* ??? We'd likely want to either cache in bst_map sth like
1837 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1838 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1839 explicit stmts to put in so the keying on 'stmts' doesn't
1840 work (but we have the same issue with nodes that use 'ops'). */
1841 slp_tree one
= new _slp_tree
;
1842 slp_tree two
= new _slp_tree
;
1843 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1844 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1845 SLP_TREE_VECTYPE (one
) = vectype
;
1846 SLP_TREE_VECTYPE (two
) = vectype
;
1847 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1848 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1850 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1851 SLP_TREE_REF_COUNT (child
)++;
1853 /* Here we record the original defs since this
1854 node represents the final lane configuration. */
1855 node
= vect_create_new_slp_node (node
, stmts
, 2);
1856 SLP_TREE_VECTYPE (node
) = vectype
;
1857 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1858 SLP_TREE_CHILDREN (node
).quick_push (one
);
1859 SLP_TREE_CHILDREN (node
).quick_push (two
);
1860 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1861 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1862 enum tree_code ocode
= ERROR_MARK
;
1863 stmt_vec_info ostmt_info
;
1865 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1867 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1868 if (gimple_assign_rhs_code (ostmt
) != code0
)
1870 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1871 ocode
= gimple_assign_rhs_code (ostmt
);
1875 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1877 SLP_TREE_CODE (one
) = code0
;
1878 SLP_TREE_CODE (two
) = ocode
;
1879 SLP_TREE_LANES (one
) = stmts
.length ();
1880 SLP_TREE_LANES (two
) = stmts
.length ();
1881 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1882 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1886 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1887 SLP_TREE_VECTYPE (node
) = vectype
;
1888 SLP_TREE_CHILDREN (node
).splice (children
);
1892 /* Dump a single SLP tree NODE. */
/* Prints the node header (def type, max_nunits, refcount), then its
   scalar stmts or scalar ops, any load/lane permutations, and the child
   node pointers.  NOTE(review): some original lines (return type,
   braces, several locals) are missing from the extraction; code text
   kept byte-identical.  */
1895 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1900 stmt_vec_info stmt_info
;
/* Build dump metadata from the internal location and fetch the user
   location for the per-line prefixes.  */
1903 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1904 dump_user_location_t user_loc
= loc
.get_user_location ();
1905 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1906 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1908 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1911 estimated_poly_value (node
->max_nunits
),
1912 SLP_TREE_REF_COUNT (node
));
/* Internal nodes carry scalar stmts; external/constant nodes carry
   scalar ops printed as a brace-enclosed list instead.  */
1913 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1914 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1915 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1918 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1919 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1920 dump_printf (metadata
, "%T%s ", op
,
1921 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1922 dump_printf (metadata
, "}\n");
/* Load permutation: one source lane index per scalar stmt.  */
1924 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1926 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1927 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1928 dump_printf (dump_kind
, " %u", j
);
1929 dump_printf (dump_kind
, " }\n");
/* Lane permutation: (operand, lane) pairs printed as op[lane].  */
1931 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1933 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1934 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1935 dump_printf (dump_kind
, " %u[%u]",
1936 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1937 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
1938 dump_printf (dump_kind
, " }\n");
/* Finally list the children as raw pointers (matching the %p printed
   in each child's own header line).  */
1940 if (SLP_TREE_CHILDREN (node
).is_empty ())
1942 dump_printf_loc (metadata
, user_loc
, "\tchildren");
1943 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1944 dump_printf (dump_kind
, " %p", (void *)child
);
1945 dump_printf (dump_kind
, "\n");
/* Debugger entry point: dump NODE to stderr via a debug dump context,
   with no user source location attached.  */
1949 debug (slp_tree node
)
1951 debug_dump_context ctx
;
1952 vect_print_slp_tree (MSG_NOTE
,
1953 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
1957 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
/* Worker: depth-first dump of the graph rooted at NODE, using VISITED
   to print each node only once (the graph may share subtrees).  */
1960 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1961 slp_tree node
, hash_set
<slp_tree
> &visited
)
1966 if (visited
.add (node
))
1969 vect_print_slp_tree (dump_kind
, loc
, node
);
1971 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1973 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
/* Entry overload: sets up a fresh visited set and starts at ENTRY.  */
1977 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1980 hash_set
<slp_tree
> visited
;
1981 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
)
;
1984 /* Mark the tree rooted at NODE with PURE_SLP. */
/* Worker: recurse over internal-def nodes only, visiting each shared
   node once, and set STMT_SLP_TYPE of every scalar stmt to pure_slp.  */
1987 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
1990 stmt_vec_info stmt_info
;
/* External/constant/... nodes carry no stmts to mark.  */
1993 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1996 if (visited
.add (node
))
1999 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2000 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2002 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2004 vect_mark_slp_stmts (child
, visited
);
/* Entry overload with a fresh visited set.  */
2008 vect_mark_slp_stmts (slp_tree node
)
2010 hash_set
<slp_tree
> visited
;
2011 vect_mark_slp_stmts (node
, visited
);
2014 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
/* Worker: same traversal shape as vect_mark_slp_stmts — internal-def
   nodes only, each shared node visited once.  */
2017 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2020 stmt_vec_info stmt_info
;
2023 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2026 if (visited
.add (node
))
2029 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
/* A stmt may only be marked relevant here if it was not relevant
   before, or already relevant with exactly vect_used_in_scope.  */
2031 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2032 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2033 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2036 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2038 vect_mark_slp_stmts_relevant (child
, visited
);
/* Entry overload with a fresh visited set.  */
2042 vect_mark_slp_stmts_relevant (slp_tree node
)
2044 hash_set
<slp_tree
> visited
;
2045 vect_mark_slp_stmts_relevant (node
, visited
);
2049 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
/* Worker: a leaf (childless) internal-def node whose first scalar stmt
   is a grouped read is pushed onto LOADS; each shared node is visited
   once.  A NULL node (skipped operand slot) is ignored.  */
2052 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2053 hash_set
<slp_tree
> &visited
)
2055 if (!node
|| visited
.add (node
))
2058 if (SLP_TREE_CHILDREN (node
).length () == 0)
2060 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2062 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2063 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2064 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2065 loads
.safe_push (node
);
2071 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2072 vect_gather_slp_loads (loads
, child
, visited
);
/* Entry overload: gather into SLP_INSTANCE_LOADS of INST starting at
   NODE, with a fresh visited set.  */
2077 vect_gather_slp_loads (slp_instance inst
, slp_tree node
)
2079 hash_set
<slp_tree
> visited
;
2080 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst
), node
, visited
);
2084 /* Find the last store in SLP INSTANCE. */
/* Walks NODE's scalar stmts, maps each back to its original (pattern
   root) stmt, and keeps the latest one per get_later_stmt.
   NOTE(review): return type, braces and the final return are among the
   lines dropped by the extraction; code text kept byte-identical.  */
2087 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2089 stmt_vec_info last
= NULL
;
2090 stmt_vec_info stmt_vinfo
;
2092 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
/* Compare original stmts, not pattern stmts, so ordering is the
   scalar code order.  */
2094 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2095 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2101 /* Find the first stmt in NODE. */
/* Mirror of vect_find_last_scalar_stmt_in_slp: keeps the earliest
   original stmt, using get_later_stmt to decide ordering.
   NOTE(review): some lines (return type, braces, the update of FIRST
   and the return) were dropped by the extraction.  */
2104 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2106 stmt_vec_info first
= NULL
;
2107 stmt_vec_info stmt_vinfo
;
2109 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2111 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
/* FIRST stays when it is not later than the current stmt.  */
2113 || get_later_stmt (stmt_vinfo
, first
) == first
)
2120 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2121 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2122 (also containing the first GROUP1_SIZE stmts, since stores are
2123 consecutive), the second containing the remainder.
2124 Return the first stmt in the second group. */
2126 static stmt_vec_info
2127 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
/* Preconditions: FIRST_VINFO heads its DR group and both resulting
   groups are non-empty.  */
2129 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2130 gcc_assert (group1_size
> 0);
2131 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2132 gcc_assert (group2_size
> 0);
2133 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
/* Advance to the last element of the first group; consecutive stores
   are expected to have unit gaps.  */
2135 stmt_vec_info stmt_info
= first_vinfo
;
2136 for (unsigned i
= group1_size
; i
> 1; i
--)
2138 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2139 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2141 /* STMT is now the last element of the first group. */
2142 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
/* Cut the chain: the first group now ends at STMT_INFO.  */
2143 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
/* Re-head the remainder as its own group rooted at GROUP2.  */
2145 DR_GROUP_SIZE (group2
) = group2_size
;
2146 for (stmt_info
= group2
; stmt_info
;
2147 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2149 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2150 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2153 /* For the second group, the DR_GROUP_GAP is that before the original group,
2154 plus skipping over the first vector. */
2155 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2157 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2158 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2162 group1_size
, group2_size
);
2167 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2168 statements and a vector of NUNITS elements. */
/* I.e. lcm (nunits, group_size) / group_size; exact_div asserts the
   division is exact.  */
2171 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2173 return exact_div (common_multiple (nunits
, group_size
), group_size
);
/* Kind of root an SLP instance is built from: a store group, a
   reduction group, or a reduction chain (further enumerators are among
   the lines dropped by the extraction).  */
2176 enum slp_instance_kind
{
2177 slp_inst_kind_store
,
2178 slp_inst_kind_reduc_group
,
2179 slp_inst_kind_reduc_chain
,
/* Forward declaration; defined later in the file.  */
2184 vect_analyze_slp_instance (vec_info
*vinfo
,
2185 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2186 stmt_vec_info stmt_info
, unsigned max_tree_size
);
2188 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2189 of KIND. Return true if successful. */
2192 vect_build_slp_instance (vec_info
*vinfo
,
2193 slp_instance_kind kind
,
2194 vec
<stmt_vec_info
> scalar_stmts
,
2195 stmt_vec_info root_stmt_info
,
2196 unsigned max_tree_size
,
2197 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2198 /* ??? We need stmt_info for group splitting. */
2199 stmt_vec_info stmt_info_
)
2201 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_NOTE
, vect_location
,
2204 "Starting SLP discovery for\n");
2205 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2206 dump_printf_loc (MSG_NOTE
, vect_location
,
2207 " %G", scalar_stmts
[i
]->stmt
);
2210 /* Build the tree for the SLP instance. */
2211 unsigned int group_size
= scalar_stmts
.length ();
2212 bool *matches
= XALLOCAVEC (bool, group_size
);
2213 unsigned npermutes
= 0;
2214 poly_uint64 max_nunits
= 1;
2215 unsigned tree_size
= 0;
2217 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2218 &max_nunits
, matches
, &npermutes
,
2219 &tree_size
, bst_map
);
2222 /* Calculate the unrolling factor based on the smallest type. */
2223 poly_uint64 unrolling_factor
2224 = calculate_unrolling_factor (max_nunits
, group_size
);
2226 if (maybe_ne (unrolling_factor
, 1U)
2227 && is_a
<bb_vec_info
> (vinfo
))
2229 unsigned HOST_WIDE_INT const_max_nunits
;
2230 if (!max_nunits
.is_constant (&const_max_nunits
)
2231 || const_max_nunits
> group_size
)
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2235 "Build SLP failed: store group "
2236 "size not a multiple of the vector size "
2237 "in basic block SLP\n");
2238 vect_free_slp_tree (node
);
2241 /* Fatal mismatch. */
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_NOTE
, vect_location
,
2244 "SLP discovery succeeded but node needs "
2246 memset (matches
, true, group_size
);
2247 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2248 vect_free_slp_tree (node
);
2252 /* Create a new SLP instance. */
2253 slp_instance new_instance
= XNEW (class _slp_instance
);
2254 SLP_INSTANCE_TREE (new_instance
) = node
;
2255 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2256 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2257 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2258 new_instance
->reduc_phis
= NULL
;
2259 new_instance
->cost_vec
= vNULL
;
2260 new_instance
->subgraph_entries
= vNULL
;
2262 vect_gather_slp_loads (new_instance
, node
);
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_NOTE
, vect_location
,
2265 "SLP size %u vs. limit %u.\n",
2266 tree_size
, max_tree_size
);
2268 /* Check whether any load is possibly permuted. */
2270 bool loads_permuted
= false;
2271 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2273 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2276 stmt_vec_info load_info
;
2277 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2278 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2280 loads_permuted
= true;
2285 /* If the loads and stores can be handled with load/store-lane
2286 instructions do not generate this SLP instance. */
2287 if (is_a
<loop_vec_info
> (vinfo
)
2289 && kind
== slp_inst_kind_store
2290 && vect_store_lanes_supported
2291 (STMT_VINFO_VECTYPE (scalar_stmts
[0]), group_size
, false))
2294 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2296 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2297 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2298 /* Use SLP for strided accesses (or if we can't load-lanes). */
2299 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2300 || ! vect_load_lanes_supported
2301 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2302 DR_GROUP_SIZE (stmt_vinfo
), false))
2305 if (i
== SLP_INSTANCE_LOADS (new_instance
).length ())
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2309 "Built SLP cancelled: can use "
2310 "load/store-lanes\n");
2311 vect_free_slp_instance (new_instance
);
2316 /* Fixup SLP reduction chains. */
2317 if (kind
== slp_inst_kind_reduc_chain
)
2319 /* If this is a reduction chain with a conversion in front
2320 amend the SLP tree with a node for that. */
2322 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2323 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2325 /* Get at the conversion stmt - we know it's the single use
2326 of the last stmt of the reduction chain. */
2327 use_operand_p use_p
;
2328 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2329 &use_p
, &scalar_def
);
2331 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2332 next_info
= vect_stmt_to_vectorize (next_info
);
2333 scalar_stmts
= vNULL
;
2334 scalar_stmts
.create (group_size
);
2335 for (unsigned i
= 0; i
< group_size
; ++i
)
2336 scalar_stmts
.quick_push (next_info
);
2337 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2338 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2339 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2340 SLP_INSTANCE_TREE (new_instance
) = conv
;
2341 /* We also have to fake this conversion stmt as SLP reduction
2342 group so we don't have to mess with too much code
2344 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2345 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2347 /* Fill the backedge child of the PHI SLP node. The
2348 general matching code cannot find it because the
2349 scalar code does not reflect how we vectorize the
2351 use_operand_p use_p
;
2352 imm_use_iterator imm_iter
;
2353 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2354 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2355 gimple_get_lhs (scalar_def
))
2356 /* There are exactly two non-debug uses, the reduction
2357 PHI and the loop-closed PHI node. */
2358 if (!is_gimple_debug (USE_STMT (use_p
))
2359 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2361 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2362 stmt_vec_info phi_info
2363 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2364 for (unsigned i
= 0; i
< group_size
; ++i
)
2365 phis
.quick_push (phi_info
);
2366 slp_tree
*phi_node
= bst_map
->get (phis
);
2367 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2368 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2369 = SLP_INSTANCE_TREE (new_instance
);
2370 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2374 vinfo
->slp_instances
.safe_push (new_instance
);
2376 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2377 the number of scalar stmts in the root in a few places.
2378 Verify that assumption holds. */
2379 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2380 .length () == group_size
);
2382 if (dump_enabled_p ())
2384 dump_printf_loc (MSG_NOTE
, vect_location
,
2385 "Final SLP tree for instance %p:\n", new_instance
);
2386 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2387 SLP_INSTANCE_TREE (new_instance
));
2395 /* Failed to SLP. */
2396 /* Free the allocated memory. */
2397 scalar_stmts
.release ();
2400 stmt_vec_info stmt_info
= stmt_info_
;
2401 /* Try to break the group up into pieces. */
2402 if (kind
== slp_inst_kind_store
)
2404 /* ??? We could delay all the actual splitting of store-groups
2405 until after SLP discovery of the original group completed.
2406 Then we can recurse to vect_build_slp_instance directly. */
2407 for (i
= 0; i
< group_size
; i
++)
2411 /* For basic block SLP, try to break the group up into multiples of
2413 if (is_a
<bb_vec_info
> (vinfo
)
2414 && (i
> 1 && i
< group_size
))
2417 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2418 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2419 1 << floor_log2 (i
));
2420 unsigned HOST_WIDE_INT const_nunits
;
2422 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2424 /* Split into two groups at the first vector boundary. */
2425 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2426 unsigned group1_size
= i
& ~(const_nunits
- 1);
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE
, vect_location
,
2430 "Splitting SLP group at stmt %u\n", i
);
2431 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2433 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2435 /* Split the rest at the failure point and possibly
2436 re-analyze the remaining matching part if it has
2437 at least two lanes. */
2439 && (i
+ 1 < group_size
2440 || i
- group1_size
> 1))
2442 stmt_vec_info rest2
= rest
;
2443 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2444 if (i
- group1_size
> 1)
2445 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2446 rest2
, max_tree_size
);
2448 /* Re-analyze the non-matching tail if it has at least
2450 if (i
+ 1 < group_size
)
2451 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2452 rest
, max_tree_size
);
2457 /* For loop vectorization split into arbitrary pieces of size > 1. */
2458 if (is_a
<loop_vec_info
> (vinfo
)
2459 && (i
> 1 && i
< group_size
))
2461 unsigned group1_size
= i
;
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_NOTE
, vect_location
,
2465 "Splitting SLP group at stmt %u\n", i
);
2467 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2469 /* Loop vectorization cannot handle gaps in stores, make sure
2470 the split group appears as strided. */
2471 STMT_VINFO_STRIDED_P (rest
) = 1;
2472 DR_GROUP_GAP (rest
) = 0;
2473 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2474 DR_GROUP_GAP (stmt_info
) = 0;
2476 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2478 if (i
+ 1 < group_size
)
2479 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2480 rest
, max_tree_size
);
2485 /* Even though the first vector did not all match, we might be able to SLP
2486 (some) of the remainder. FORNOW ignore this possibility. */
2489 /* Failed to SLP. */
2490 if (dump_enabled_p ())
2491 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2496 /* Analyze an SLP instance starting from a group of grouped stores. Call
2497 vect_build_slp_tree to build a tree of packed stmts if possible.
2498 Return FALSE if it's impossible to SLP any stmt in the loop. */
2501 vect_analyze_slp_instance (vec_info
*vinfo
,
2502 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2503 stmt_vec_info stmt_info
, unsigned max_tree_size
)
2505 unsigned int group_size
;
2507 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2508 vec
<stmt_vec_info
> scalar_stmts
;
2509 slp_instance_kind kind
;
2511 if (is_a
<bb_vec_info
> (vinfo
))
2512 vect_location
= stmt_info
->stmt
;
2513 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2515 kind
= slp_inst_kind_store
;
2516 group_size
= DR_GROUP_SIZE (stmt_info
);
2518 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2520 kind
= slp_inst_kind_reduc_chain
;
2521 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2522 group_size
= REDUC_GROUP_SIZE (stmt_info
);
2524 else if (is_gimple_assign (stmt_info
->stmt
)
2525 && gimple_assign_rhs_code (stmt_info
->stmt
) == CONSTRUCTOR
)
2527 kind
= slp_inst_kind_ctor
;
2528 group_size
= CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info
->stmt
));
2532 kind
= slp_inst_kind_reduc_group
;
2533 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2534 group_size
= as_a
<loop_vec_info
> (vinfo
)->reductions
.length ();
2537 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2538 scalar_stmts
.create (group_size
);
2539 stmt_vec_info next_info
= stmt_info
;
2540 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2542 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2545 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2546 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2549 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2551 /* Collect the reduction stmts and store them in
2552 SLP_TREE_SCALAR_STMTS. */
2555 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2556 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2558 /* Mark the first element of the reduction chain as reduction to properly
2559 transform the node. In the reduction analysis phase only the last
2560 element of the chain is marked as reduction. */
2561 STMT_VINFO_DEF_TYPE (stmt_info
)
2562 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2563 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2564 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2566 else if (kind
== slp_inst_kind_ctor
)
2568 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2570 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2572 if (TREE_CODE (val
) == SSA_NAME
)
2574 gimple
* def
= SSA_NAME_DEF_STMT (val
);
2575 stmt_vec_info def_info
= vinfo
->lookup_stmt (def
);
2576 /* Value is defined in another basic block. */
2579 def_info
= vect_stmt_to_vectorize (def_info
);
2580 scalar_stmts
.safe_push (def_info
);
2585 if (dump_enabled_p ())
2586 dump_printf_loc (MSG_NOTE
, vect_location
,
2587 "Analyzing vectorizable constructor: %G\n",
2592 /* Collect reduction statements. */
2593 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2594 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2595 if (STMT_VINFO_RELEVANT_P (next_info
)
2596 || STMT_VINFO_LIVE_P (next_info
))
2597 scalar_stmts
.quick_push (next_info
);
2600 /* Build the tree for the SLP instance. */
2601 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2602 kind
== slp_inst_kind_ctor
2605 bst_map
, stmt_info
);
2607 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2608 where we should do store group splitting. */
2613 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2614 trees of packed scalar stmts if SLP is possible. */
2617 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2620 stmt_vec_info first_element
;
2622 DUMP_VECT_SCOPE ("vect_analyze_slp");
2624 scalar_stmts_to_slp_tree_map_t
*bst_map
2625 = new scalar_stmts_to_slp_tree_map_t ();
2627 /* Find SLP sequences starting from groups of grouped stores. */
2628 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2629 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
, max_tree_size
);
2631 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2633 /* Find SLP sequences starting from reduction chains. */
2634 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2635 if (! STMT_VINFO_RELEVANT_P (first_element
)
2636 && ! STMT_VINFO_LIVE_P (first_element
))
2638 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2641 /* Dissolve reduction chain group. */
2642 stmt_vec_info vinfo
= first_element
;
2643 stmt_vec_info last
= NULL
;
2646 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2647 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2648 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2652 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2653 /* It can be still vectorized as part of an SLP reduction. */
2654 loop_vinfo
->reductions
.safe_push (last
);
2657 /* Find SLP sequences starting from groups of reductions. */
2658 if (loop_vinfo
->reductions
.length () > 1)
2659 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2663 /* The map keeps a reference on SLP nodes built, release that. */
2664 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2665 it
!= bst_map
->end (); ++it
)
2667 vect_free_slp_tree ((*it
).second
);
2670 return opt_result::success ();
2673 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2676 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2677 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2682 if (visited
.add (node
))
2685 node
->vertex
= vertices
.length ();
2686 vertices
.safe_push (node
);
2689 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2693 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2696 leafs
.safe_push (node
->vertex
);
2699 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2702 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2705 hash_set
<slp_tree
> visited
;
2707 slp_instance instance
;
2708 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2709 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2713 /* Apply (reverse) bijectite PERM to VEC. */
2717 vect_slp_permute (vec
<unsigned> perm
,
2718 vec
<T
> &vec
, bool reverse
)
2720 auto_vec
<T
, 64> saved
;
2721 saved
.create (vec
.length ());
2722 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2723 saved
.quick_push (vec
[i
]);
2727 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2728 vec
[perm
[i
]] = saved
[i
];
2729 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2730 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
2734 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2735 vec
[i
] = saved
[perm
[i
]];
2736 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2737 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
2741 /* Return whether permutations PERM_A and PERM_B as recorded in the
2742 PERMS vector are equal. */
2745 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
2746 int perm_a
, int perm_b
)
2748 return (perm_a
== perm_b
2749 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
2750 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
2751 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
2754 /* Optimize the SLP graph of VINFO. */
2757 vect_optimize_slp (vec_info
*vinfo
)
2759 if (vinfo
->slp_instances
.is_empty ())
2764 auto_vec
<slp_tree
> vertices
;
2765 auto_vec
<int> leafs
;
2766 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
2768 struct graph
*slpg
= new_graph (vertices
.length ());
2769 FOR_EACH_VEC_ELT (vertices
, i
, node
)
2773 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2775 add_edge (slpg
, i
, child
->vertex
);
2778 /* Compute (reverse) postorder on the inverted graph. */
2780 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
2782 auto_sbitmap
n_visited (vertices
.length ());
2783 auto_sbitmap
n_materialize (vertices
.length ());
2784 auto_vec
<int> n_perm (vertices
.length ());
2785 auto_vec
<vec
<unsigned> > perms
;
2787 bitmap_clear (n_visited
);
2788 bitmap_clear (n_materialize
);
2789 n_perm
.quick_grow_cleared (vertices
.length ());
2790 perms
.safe_push (vNULL
); /* zero is no permute */
2792 /* Produce initial permutations. */
2793 for (i
= 0; i
< leafs
.length (); ++i
)
2796 slp_tree node
= vertices
[idx
];
2798 /* Handle externals and constants optimistically throughout the
2800 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
2801 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
2804 /* Loads are the only thing generating permutes and leafs do not
2805 change across iterations. */
2806 bitmap_set_bit (n_visited
, idx
);
2807 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2810 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2811 node unpermuted, record this permute. */
2812 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
2813 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
2815 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
2816 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
2817 bool any_permute
= false;
2818 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2820 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
2821 imin
= MIN (imin
, idx
);
2822 imax
= MAX (imax
, idx
);
2823 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
2826 /* If there's no permute no need to split one out. */
2829 /* If the span doesn't match we'd disrupt VF computation, avoid
2831 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
2834 /* For now only handle true permutes, like
2835 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2836 when permuting constants and invariants keeping the permute
2838 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
2839 bitmap_clear (load_index
);
2840 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2841 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
2843 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2844 if (!bitmap_bit_p (load_index
, j
))
2846 if (j
!= SLP_TREE_LANES (node
))
2849 vec
<unsigned> perm
= vNULL
;
2850 perm
.safe_grow (SLP_TREE_LANES (node
), true);
2851 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2852 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
2853 perms
.safe_push (perm
);
2854 n_perm
[idx
] = perms
.length () - 1;
2857 /* Propagate permutes along the graph and compute materialization points. */
2859 unsigned iteration
= 0;
2865 for (i
= vertices
.length (); i
> 0 ; --i
)
2868 slp_tree node
= vertices
[idx
];
2869 /* For leafs there's nothing to do - we've seeded permutes
2871 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2874 bitmap_set_bit (n_visited
, idx
);
2876 /* We cannot move a permute across a store. */
2877 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
2879 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
2883 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
2884 succ
; succ
= succ
->succ_next
)
2886 int succ_idx
= succ
->dest
;
2887 /* Handle unvisited nodes optimistically. */
2888 /* ??? But for constants once we want to handle non-bijective
2889 permutes we have to verify the permute, when unifying lanes,
2890 will not unify different constants. For example see
2891 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2892 if (!bitmap_bit_p (n_visited
, succ_idx
))
2894 int succ_perm
= n_perm
[succ_idx
];
2895 /* Once we materialize succs permutation its output lanes
2896 appear unpermuted to us. */
2897 if (bitmap_bit_p (n_materialize
, succ_idx
))
2901 else if (succ_perm
== 0)
2906 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
2914 /* Pick up pre-computed leaf values. */
2916 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
2919 /* Make sure we eventually converge. */
2920 gcc_checking_assert (perm
== 0);
2923 bitmap_clear_bit (n_materialize
, idx
);
2930 /* Elide pruning at materialization points in the first
2931 iteration so every node was visited once at least. */
2935 /* Decide on permute materialization. Look whether there's
2936 a use (pred) edge that is permuted differently than us.
2937 In that case mark ourselves so the permutation is applied. */
2938 bool all_preds_permuted
= slpg
->vertices
[idx
].pred
!= NULL
;
2939 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
2940 pred
; pred
= pred
->pred_next
)
2942 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
2943 int pred_perm
= n_perm
[pred
->src
];
2944 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
2946 all_preds_permuted
= false;
2950 if (!all_preds_permuted
)
2952 if (!bitmap_bit_p (n_materialize
, idx
))
2954 bitmap_set_bit (n_materialize
, idx
);
2958 while (changed
|| iteration
== 1);
2961 for (i
= 0; i
< vertices
.length (); ++i
)
2963 int perm
= n_perm
[i
];
2967 slp_tree node
= vertices
[i
];
2969 /* First permute invariant/external original successors. */
2972 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2974 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2977 /* If the vector is uniform there's nothing to do. */
2978 if (vect_slp_tree_uniform_p (child
))
2981 /* We can end up sharing some externals via two_operator
2982 handling. Be prepared to unshare those. */
2983 if (child
->refcnt
!= 1)
2985 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
2986 SLP_TREE_CHILDREN (node
)[j
] = child
2987 = vect_create_new_slp_node
2988 (SLP_TREE_SCALAR_OPS (child
).copy ());
2990 vect_slp_permute (perms
[perm
],
2991 SLP_TREE_SCALAR_OPS (child
), true);
2994 if (bitmap_bit_p (n_materialize
, i
))
2996 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2997 /* For loads simply drop the permutation, the load permutation
2998 already performs the desired permutation. */
3002 if (dump_enabled_p ())
3003 dump_printf_loc (MSG_NOTE
, vect_location
,
3004 "inserting permute node in place of %p\n",
3007 /* Make a copy of NODE and in-place change it to a
3008 VEC_PERM node to permute the lanes of the copy. */
3009 slp_tree copy
= new _slp_tree
;
3010 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
3011 SLP_TREE_CHILDREN (node
) = vNULL
;
3012 SLP_TREE_SCALAR_STMTS (copy
)
3013 = SLP_TREE_SCALAR_STMTS (node
).copy ();
3014 vect_slp_permute (perms
[perm
],
3015 SLP_TREE_SCALAR_STMTS (copy
), true);
3016 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
3017 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
3018 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
3019 SLP_TREE_LANE_PERMUTATION (copy
)
3020 = SLP_TREE_LANE_PERMUTATION (node
);
3021 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3022 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3024 copy
->max_nunits
= node
->max_nunits
;
3025 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3026 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3027 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3029 /* Now turn NODE into a VEC_PERM. */
3030 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3031 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3032 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3033 SLP_TREE_LANE_PERMUTATION (node
)
3034 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3035 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3040 /* Apply the reverse permutation to our stmts. */
3041 vect_slp_permute (perms
[perm
],
3042 SLP_TREE_SCALAR_STMTS (node
), true);
3043 /* And to the load permutation, which we can simply
3044 make regular by design. */
3045 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3047 /* ??? When we handle non-bijective permutes the idea
3048 is that we can force the load-permutation to be
3049 { min, min + 1, min + 2, ... max }. But then the
3050 scalar defs might no longer match the lane content
3051 which means wrong-code with live lane vectorization.
3052 So we possibly have to have NULL entries for those. */
3053 vect_slp_permute (perms
[perm
],
3054 SLP_TREE_LOAD_PERMUTATION (node
), true);
3059 /* Free the perms vector used for propagation. */
3060 while (!perms
.is_empty ())
3061 perms
.pop ().release ();
3065 /* Now elide load permutations that are not necessary. */
3066 for (i
= 0; i
< leafs
.length (); ++i
)
3068 node
= vertices
[leafs
[i
]];
3069 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3072 /* In basic block vectorization we allow any subchain of an interleaving
3074 FORNOW: not in loop SLP because of realignment complications. */
3075 if (is_a
<bb_vec_info
> (vinfo
))
3077 bool subchain_p
= true;
3078 stmt_vec_info next_load_info
= NULL
;
3079 stmt_vec_info load_info
;
3081 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3084 && (next_load_info
!= load_info
3085 || DR_GROUP_GAP (load_info
) != 1))
3090 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3094 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3100 stmt_vec_info load_info
;
3101 bool this_load_permuted
= false;
3103 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3104 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3106 this_load_permuted
= true;
3109 stmt_vec_info first_stmt_info
3110 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3111 if (!this_load_permuted
3112 /* The load requires permutation when unrolling exposes
3113 a gap either because the group is larger than the SLP
3114 group-size or because there is a gap between the groups. */
3115 && (known_eq (LOOP_VINFO_VECT_FACTOR
3116 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3117 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3118 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3120 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3128 /* For each possible SLP instance decide whether to SLP it and calculate overall
3129 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3130 least one instance. */
3133 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3136 poly_uint64 unrolling_factor
= 1;
3137 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3138 slp_instance instance
;
3139 int decided_to_slp
= 0;
3141 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3143 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3145 /* FORNOW: SLP if you can. */
3146 /* All unroll factors have the form:
3148 GET_MODE_SIZE (vinfo->vector_mode) * X
3150 for some rational X, so they must have a common multiple. */
3152 = force_common_multiple (unrolling_factor
,
3153 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3155 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3156 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3157 loop-based vectorization. Such stmts will be marked as HYBRID. */
3158 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3162 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3164 if (decided_to_slp
&& dump_enabled_p ())
3166 dump_printf_loc (MSG_NOTE
, vect_location
,
3167 "Decided to SLP %d instances. Unrolling factor ",
3169 dump_dec (MSG_NOTE
, unrolling_factor
);
3170 dump_printf (MSG_NOTE
, "\n");
3173 return (decided_to_slp
> 0);
3176 /* Private data for vect_detect_hybrid_slp. */
3179 loop_vec_info loop_vinfo
;
3180 vec
<stmt_vec_info
> *worklist
;
3183 /* Walker for walk_gimple_op. */
3186 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3188 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3189 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3194 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3197 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3198 if (PURE_SLP_STMT (def_stmt_info
))
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3202 def_stmt_info
->stmt
);
3203 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3204 dat
->worklist
->safe_push (def_stmt_info
);
3210 /* Find stmts that must be both vectorized and SLPed. */
3213 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3215 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3217 /* All stmts participating in SLP are marked pure_slp, all other
3218 stmts are loop_vect.
3219 First collect all loop_vect stmts into a worklist. */
3220 auto_vec
<stmt_vec_info
> worklist
;
3221 for (unsigned i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
3223 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3224 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3227 gphi
*phi
= gsi
.phi ();
3228 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3229 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3230 worklist
.safe_push (stmt_info
);
3232 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
3235 gimple
*stmt
= gsi_stmt (gsi
);
3236 if (is_gimple_debug (stmt
))
3238 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3239 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3241 for (gimple_stmt_iterator gsi2
3242 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3243 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3245 stmt_vec_info patt_info
3246 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3247 if (!STMT_SLP_TYPE (patt_info
)
3248 && STMT_VINFO_RELEVANT (patt_info
))
3249 worklist
.safe_push (patt_info
);
3251 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3253 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3254 worklist
.safe_push (stmt_info
);
3258 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3259 mark any SLP vectorized stmt as hybrid.
3260 ??? We're visiting def stmts N times (once for each non-SLP and
3261 once for each hybrid-SLP use). */
3264 dat
.worklist
= &worklist
;
3265 dat
.loop_vinfo
= loop_vinfo
;
3266 memset (&wi
, 0, sizeof (wi
));
3267 wi
.info
= (void *)&dat
;
3268 while (!worklist
.is_empty ())
3270 stmt_vec_info stmt_info
= worklist
.pop ();
3271 /* Since SSA operands are not set up for pattern stmts we need
3272 to use walk_gimple_op. */
3274 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3279 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3281 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3282 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
)
3284 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3287 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3290 gphi
*phi
= si
.phi ();
3291 gimple_set_uid (phi
, 0);
3294 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3295 !gsi_end_p (gsi
); gsi_next (&gsi
))
3297 gimple
*stmt
= gsi_stmt (gsi
);
3298 gimple_set_uid (stmt
, 0);
3299 if (is_gimple_debug (stmt
))
3307 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3308 stmts in the basic block. */
3310 _bb_vec_info::~_bb_vec_info ()
3312 /* Reset region marker. */
3313 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3316 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3319 gphi
*phi
= si
.phi ();
3320 gimple_set_uid (phi
, -1);
3322 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3323 !gsi_end_p (gsi
); gsi_next (&gsi
))
3325 gimple
*stmt
= gsi_stmt (gsi
);
3326 gimple_set_uid (stmt
, -1);
3331 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3332 given then that child nodes have already been processed, and that
3333 their def types currently match their SLP node's def type. */
3336 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3337 slp_instance node_instance
,
3338 stmt_vector_for_cost
*cost_vec
)
3340 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3341 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3343 /* Calculate the number of vector statements to be created for the
3344 scalar stmts in this node. For SLP reductions it is equal to the
3345 number of vector statements in the children (which has already been
3346 calculated by the recursive call). Otherwise it is the number of
3347 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3348 VF divided by the number of elements in a vector. */
3349 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3350 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
3352 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3353 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3355 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3356 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3363 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3364 vf
= loop_vinfo
->vectorization_factor
;
3367 unsigned int group_size
= SLP_TREE_LANES (node
);
3368 tree vectype
= SLP_TREE_VECTYPE (node
);
3369 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3370 = vect_get_num_vectors (vf
* group_size
, vectype
);
3373 /* Handle purely internal nodes. */
3374 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3375 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3377 if (is_a
<bb_vec_info
> (vinfo
)
3378 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3380 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3382 "desired vector type conflicts with earlier one "
3383 "for %G", stmt_info
->stmt
);
3388 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3389 node
, node_instance
, cost_vec
);
3392 /* Try to build NODE from scalars, returning true on success.
3393 NODE_INSTANCE is the SLP instance that contains NODE. */
/* NOTE(review): lossily-extracted fragment — braces/returns and the return
   type line are missing; code kept byte-identical, comments only.
   Demotes an internal SLP node to vect_external_def, filling its
   SLP_TREE_SCALAR_OPS with the scalar defs' LHS values so the operand
   vector can be built from scalars.  Refuses for non-BB vectorization,
   for the instance root, for nodes without scalar stmts, and for nodes
   containing pattern stmts.  */
3396 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3397 slp_instance node_instance
)
3399 stmt_vec_info stmt_info
;
3402 if (!is_a
<bb_vec_info
> (vinfo
)
3403 || node
== SLP_INSTANCE_TREE (node_instance
)
3404 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3405 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3408 if (dump_enabled_p ())
3409 dump_printf_loc (MSG_NOTE
, vect_location
,
3410 "Building vector operands of %p from scalars instead\n", node
);
3412 /* Don't remove and free the child nodes here, since they could be
3413 referenced by other structures. The analysis and scheduling phases
3414 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3415 unsigned int group_size
= SLP_TREE_LANES (node
);
3416 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3417 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3418 SLP_TREE_LOAD_PERMUTATION (node
).release ();
/* Record each lane's scalar def (the original, un-patterned stmt's LHS)
   as the external operand for that lane.  */
3419 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3421 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3422 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3427 /* Compute the prologue cost for invariant or constant operands represented
/* NOTE(review): lossily-extracted fragment — the tail of the leading comment,
   braces, and several statements (e.g. the `nelt` counter handling and the
   constant-def cost kind) are missing from this view; code kept byte-identical,
   comments only.
   Records prologue construction cost for a constant/external SLP node:
   nothing for a pre-existing vector def, one cost entry per emitted vector
   (splat via scalar_to_vec when all elements match, vec_construct
   otherwise for externals).  */
3431 vect_prologue_cost_for_slp (slp_tree node
,
3432 stmt_vector_for_cost
*cost_vec
)
3434 /* There's a special case of an existing vector, that costs nothing. */
3435 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3436 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3438 /* Without looking at the actual initializer a vector of
3439 constants can be implemented as load from the constant pool.
3440 When all elements are the same we can use a splat. */
3441 tree vectype
= SLP_TREE_VECTYPE (node
);
3442 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3443 unsigned num_vects_to_check
;
3444 unsigned HOST_WIDE_INT const_nunits
;
3445 unsigned nelt_limit
;
/* Constant-length vectors whose element count does not evenly tile the
   group need every emitted vector checked individually.  */
3446 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3447 && ! multiple_p (const_nunits
, group_size
))
3449 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3450 nelt_limit
= const_nunits
;
3454 /* If either the vector has variable length or the vectors
3455 are composed of repeated whole groups we only need to
3456 cost construction once. All vectors will be the same. */
3457 num_vects_to_check
= 1;
3458 nelt_limit
= group_size
;
3460 tree elt
= NULL_TREE
;
3462 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3464 unsigned si
= j
% group_size
;
3466 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3467 /* ??? We're just tracking whether all operands of a single
3468 vector initializer are the same, ideally we'd check if
3469 we emitted the same one already. */
3470 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
/* Once a full vector's worth of elements has been scanned, record one
   construction cost for it (splat vs. element-wise construct).  */
3473 if (nelt
== nelt_limit
)
3475 record_stmt_cost (cost_vec
, 1,
3476 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3477 ? (elt
? scalar_to_vec
: vec_construct
)
3479 NULL
, vectype
, 0, vect_prologue
);
3485 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3486 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3488 Return true if the operations are supported. */
/* NOTE(review): lossily-extracted fragment — braces, returns, several
   conditions (e.g. the NULL-node / !res early exits) are missing; code kept
   byte-identical, comments only.
   Recursive post-order analysis driver: analyzes children first, then the
   node itself via vect_slp_analyze_node_operations_1.  VISITED_SET/
   VISITED_VEC implement sharing with rollback on failure; COST_VEC is
   likewise truncated back on failure.  On success, invariant (constant/
   external) children get their vec-stmt count set and prologue cost
   recorded; on failure a scalar fallback is attempted via
   vect_slp_convert_to_external.  */
3491 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3492 slp_instance node_instance
,
3493 hash_set
<slp_tree
> &visited_set
,
3494 vec
<slp_tree
> &visited_vec
,
3495 stmt_vector_for_cost
*cost_vec
)
3500 /* Assume we can code-generate all invariants. */
3502 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3503 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
/* An uninitialized def here means SLP discovery left a cycle unresolved.  */
3506 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3508 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_NOTE
, vect_location
,
3510 "Failed cyclic SLP reference in %p", node
);
3513 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3515 /* If we already analyzed the exact same set of scalar stmts we're done.
3516 We share the generated vector stmts for those. */
3517 if (visited_set
.add (node
))
3519 visited_vec
.safe_push (node
);
/* Remember rollback points so a failed subtree can be un-visited and
   its provisional costs dropped.  */
3522 unsigned visited_rec_start
= visited_vec
.length ();
3523 unsigned cost_vec_rec_start
= cost_vec
->length ();
3524 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3526 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3527 visited_set
, visited_vec
,
3534 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3536 /* If analysis failed we have to pop all recursive visited nodes
3540 while (visited_vec
.length () >= visited_rec_start
)
3541 visited_set
.remove (visited_vec
.pop ());
3542 cost_vec
->truncate (cost_vec_rec_start
);
3545 /* When the node can be vectorized cost invariant nodes it references.
3546 This is not done in DFS order to allow the refering node
3547 vectorizable_* calls to nail down the invariant nodes vector type
3548 and possibly unshare it if it needs a different vector type than
3551 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3553 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3554 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3555 /* Perform usual caching, note code-generation still
3556 code-gens these nodes multiple times but we expect
3557 to CSE them later. */
3558 && !visited_set
.add (child
))
3560 visited_vec
.safe_push (child
);
3561 /* ??? After auditing more code paths make a "default"
3562 and push the vector type from NODE to all children
3563 if it is not already set. */
3564 /* Compute the number of vectors to be generated. */
3565 tree vector_type
= SLP_TREE_VECTYPE (child
);
3568 /* For shifts with a scalar argument we don't need
3569 to cost or code-generate anything.
3570 ??? Represent this more explicitely. */
3571 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3572 == shift_vec_info_type
)
3576 unsigned group_size
= SLP_TREE_LANES (child
);
3578 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3579 vf
= loop_vinfo
->vectorization_factor
;
3580 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3581 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3582 /* And cost them. */
3583 vect_prologue_cost_for_slp (child
, cost_vec
);
3586 /* If this node or any of its children can't be vectorized, try pruning
3587 the tree here rather than felling the whole thing. */
3588 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3590 /* We'll need to revisit this for invariant costing and number
3591 of vectorized stmt setting. */
3599 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3600 region and that can be vectorized using vectorizable_live_operation
3601 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3602 scalar code computing it to be retained. */
/* NOTE(review): lossily-extracted fragment — braces/returns and some
   condition lines (e.g. the !use_stmt_info test before PURE_SLP_STMT) are
   missing from this view; code kept byte-identical, comments only.
   Walks the SLP tree once (VISITED guards nodes, SVISITED guards scalar
   stmts), and for each scalar def with a use outside the pure-SLP region
   sets STMT_VINFO_LIVE_P and checks vectorizable_live_operation; it then
   conservatively verifies a lane extract can be inserted (dominance of the
   node's last scalar stmt over the out-of-region uses), clearing the live
   flag again if not.  */
3605 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3606 slp_instance instance
,
3607 stmt_vector_for_cost
*cost_vec
,
3608 hash_set
<stmt_vec_info
> &svisited
,
3609 hash_set
<slp_tree
> &visited
)
3611 if (visited
.add (node
))
3615 stmt_vec_info stmt_info
;
3616 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3617 bool all_visited
= true;
3618 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3620 if (svisited
.contains (stmt_info
))
3622 all_visited
= false;
3623 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3624 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
3625 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
3626 /* Only the pattern root stmt computes the original scalar value. */
3628 bool mark_visited
= true;
3629 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3630 ssa_op_iter op_iter
;
3631 def_operand_p def_p
;
3632 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3634 imm_use_iterator use_iter
;
3636 stmt_vec_info use_stmt_info
;
/* A non-debug use that is not a pure-SLP stmt of this region means the
   scalar value escapes the vectorized region: mark it live.  */
3637 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3638 if (!is_gimple_debug (use_stmt
))
3640 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
3642 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3644 STMT_VINFO_LIVE_P (stmt_info
) = true;
3645 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3646 NULL
, node
, instance
, i
,
3648 /* ??? So we know we can vectorize the live stmt
3649 from one SLP node. If we cannot do so from all
3650 or none consistently we'd have to record which
3651 SLP node (and lane) we want to use for the live
3652 operation. So make sure we can code-generate
3654 mark_visited
= false;
3656 STMT_VINFO_LIVE_P (stmt_info
) = false;
3657 BREAK_FROM_IMM_USE_STMT (use_iter
);
3660 /* We have to verify whether we can insert the lane extract
3661 before all uses. The following is a conservative approximation.
3662 We cannot put this into vectorizable_live_operation because
3663 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3665 Note that while the fact that we emit code for loads at the
3666 first load should make this a non-problem leafs we construct
3667 from scalars are vectorized after the last scalar def.
3668 ??? If we'd actually compute the insert location during
3669 analysis we could use sth less conservative than the last
3670 scalar stmt in the node for the dominance check. */
3671 /* ??? What remains is "live" uses in vector CTORs in the same
3672 SLP graph which is where those uses can end up code-generated
3673 right after their definition instead of close to their original
3674 use. But that would restrict us to code-generate lane-extracts
3675 from the latest stmt in a node. So we compensate for this
3676 during code-generation, simply not replacing uses for those
3677 hopefully rare cases. */
3678 if (STMT_VINFO_LIVE_P (stmt_info
))
3679 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3680 if (!is_gimple_debug (use_stmt
)
3681 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
3682 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3683 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
3685 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3687 "Cannot determine insertion place for "
3689 STMT_VINFO_LIVE_P (stmt_info
) = false;
3690 mark_visited
= true;
3694 svisited
.add (stmt_info
);
/* Recurse into internal-def children only; invariant children have no
   scalar stmts of their own to mark.  */
3700 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3701 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3702 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
3703 cost_vec
, svisited
, visited
)
3706 /* Analyze statements in SLP instances of VINFO. Return true if the
3707 operations are supported. */
/* NOTE(review): lossily-extracted fragment — braces/returns and loop-control
   lines are missing; code kept byte-identical, comments only.
   Iterates over all SLP instances, analyzing each instance tree; failing
   instances are freed and removed (with their visited-nodes and costs
   rolled back) so the remaining ones can still be vectorized.  Succeeding
   instances keep their COST_VEC (BB vectorization, costed per subgraph
   later) or have it added to the global target cost data (loop
   vectorization).  Finally, for BB vectorization, live lanes are marked.  */
3710 vect_slp_analyze_operations (vec_info
*vinfo
)
3712 slp_instance instance
;
3715 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3717 hash_set
<slp_tree
> visited
;
/* Note: no ++i in the loop header — on removal the next instance slides
   into slot i; the increment presumably lives in a missing else-path.  */
3718 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
3720 auto_vec
<slp_tree
> visited_vec
;
3721 stmt_vector_for_cost cost_vec
;
3722 cost_vec
.create (2);
3723 if (is_a
<bb_vec_info
> (vinfo
))
3724 vect_location
= instance
->location ();
3725 if (!vect_slp_analyze_node_operations (vinfo
,
3726 SLP_INSTANCE_TREE (instance
),
3727 instance
, visited
, visited_vec
,
3729 /* Instances with a root stmt require vectorized defs for the
3731 || (SLP_INSTANCE_ROOT_STMT (instance
)
3732 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
3733 != vect_internal_def
)))
3735 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3736 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3737 if (dump_enabled_p ())
3738 dump_printf_loc (MSG_NOTE
, vect_location
,
3739 "removing SLP instance operations starting from: %G",
3741 vect_free_slp_instance (instance
);
3742 vinfo
->slp_instances
.ordered_remove (i
);
3743 cost_vec
.release ();
/* Un-share: nodes first visited under this failed instance must become
   analyzable again for later instances.  */
3744 while (!visited_vec
.is_empty ())
3745 visited
.remove (visited_vec
.pop ());
3751 /* For BB vectorization remember the SLP graph entry
3753 if (is_a
<bb_vec_info
> (vinfo
))
3754 instance
->cost_vec
= cost_vec
;
3757 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
3758 cost_vec
.release ();
3763 /* Compute vectorizable live stmts. */
3764 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3766 hash_set
<stmt_vec_info
> svisited
;
3767 hash_set
<slp_tree
> visited
;
3768 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3770 vect_location
= instance
->location ();
3771 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
3772 instance
, &instance
->cost_vec
, svisited
,
3777 return !vinfo
->slp_instances
.is_empty ();
3780 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3781 closing the eventual chain. */
/* NOTE(review): lossily-extracted fragment — return type, braces, the
   `instance = *tem` step and final return are missing; code kept
   byte-identical, comments only.
   Union-find style "find with path compression": follows the leader chain
   to its fixpoint, then rewrites every visited map slot to point directly
   at the ultimate leader.  */
3784 get_ultimate_leader (slp_instance instance
,
3785 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3787 auto_vec
<slp_instance
*, 8> chain
;
3789 while (*(tem
= instance_leader
.get (instance
)) != instance
)
3791 chain
.safe_push (tem
);
/* Path compression: make every chain entry point at the root.  */
3794 while (!chain
.is_empty ())
3795 *chain
.pop () = instance
;
3799 /* Worker of vect_bb_partition_graph, recurse on NODE. */
/* NOTE(review): lossily-extracted fragment — braces, the `existed_p` handling
   and returns are missing; code kept byte-identical, comments only.
   Assigns every scalar stmt reachable from NODE to INSTANCE in
   STMT_TO_INSTANCE; when a stmt already belongs to another instance the
   two instances' subgraphs are merged by making INSTANCE the leader of the
   other's ultimate leader (union step of a union-find).  VISITED prevents
   re-walking shared nodes.  */
3802 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
3803 slp_instance instance
, slp_tree node
,
3804 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
3805 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
3806 hash_set
<slp_tree
> &visited
)
3808 stmt_vec_info stmt_info
;
3811 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3814 slp_instance
&stmt_instance
3815 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
)
3818 else if (stmt_instance
!= instance
)
3820 /* If we're running into a previously marked stmt make us the
3821 leader of the current ultimate leader. This keeps the
3822 leader chain acyclic and works even when the current instance
3823 connects two previously independent graph parts. */
3824 slp_instance stmt_leader
3825 = get_ultimate_leader (stmt_instance
, instance_leader
);
3826 if (stmt_leader
!= instance
)
3827 instance_leader
.put (stmt_leader
, instance
);
3829 stmt_instance
= instance
;
3832 if (visited
.add (node
))
3836 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3837 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3838 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
3839 instance_leader
, visited
);
3842 /* Partition the SLP graph into pieces that can be costed independently. */
/* NOTE(review): lossily-extracted fragment — braces and trailing dump
   arguments are missing; code kept byte-identical, comments only.
   Two passes: (1) walk every instance tree assigning stmts to instances
   and unioning overlapping instances via INSTANCE_LEADER (each instance
   starts as its own leader); (2) push each instance onto its ultimate
   leader's subgraph_entries, producing independent subgraphs for the
   cost model.  */
3845 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
3847 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3849 /* First walk the SLP graph assigning each involved scalar stmt a
3850 corresponding SLP graph entry and upon visiting a previously
3851 marked stmt, make the stmts leader the current SLP graph entry. */
3852 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
3853 hash_map
<slp_instance
, slp_instance
> instance_leader
;
3854 hash_set
<slp_tree
> visited
;
3855 slp_instance instance
;
3856 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3858 instance_leader
.put (instance
, instance
);
3859 vect_bb_partition_graph_r (bb_vinfo
,
3860 instance
, SLP_INSTANCE_TREE (instance
),
3861 stmt_to_instance
, instance_leader
,
3865 /* Then collect entries to each independent subgraph. */
3866 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3868 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
3869 leader
->subgraph_entries
.safe_push (instance
);
3870 if (dump_enabled_p ()
3871 && leader
!= instance
)
3872 dump_printf_loc (MSG_NOTE
, vect_location
,
3873 "instance %p is leader of %p\n",
3878 /* Compute the scalar cost of the SLP node NODE and its children
3879 and return it. Do not account defs that are marked in LIFE and
3880 update LIFE according to uses of NODE. */
/* NOTE(review): lossily-extracted fragment — braces, the (*life)[i] guard,
   the scalar_load/scalar_stmt kind assignments and continue statements are
   missing from this view; code kept byte-identical, comments only.
   Accumulates into COST_VEC the scalar cost of the stmts the SLP tree
   would replace, skipping lanes marked live (used outside the region) and
   counting each scalar stmt only once via the gimple visited flag.
   Recurses into internal children with a per-child copy of LIFE, remapped
   through the lane permutation for VEC_PERM_EXPR nodes.  */
3883 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
3884 slp_tree node
, vec
<bool, va_heap
> *life
,
3885 stmt_vector_for_cost
*cost_vec
,
3886 hash_set
<slp_tree
> &visited
)
3889 stmt_vec_info stmt_info
;
3892 if (visited
.add (node
))
3895 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3897 ssa_op_iter op_iter
;
3898 def_operand_p def_p
;
3903 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3904 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3906 /* If there is a non-vectorized use of the defs then the scalar
3907 stmt is kept live in which case we do not account it or any
3908 required defs in the SLP children in the scalar cost. This
3909 way we make the vectorization more costly when compared to
3911 if (!STMT_VINFO_LIVE_P (stmt_info
))
3913 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3915 imm_use_iterator use_iter
;
3917 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3918 if (!is_gimple_debug (use_stmt
))
3920 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
3923 (vect_stmt_to_vectorize (use_stmt_info
)))
3926 BREAK_FROM_IMM_USE_STMT (use_iter
);
3934 /* Count scalar stmts only once. */
3935 if (gimple_visited_p (orig_stmt
))
3937 gimple_set_visited (orig_stmt
, true);
/* Classify the scalar cost kind by the stmt's data reference (load vs
   store) or nop-conversion status; assignments for the read/nop cases are
   missing from this fragment.  */
3939 vect_cost_for_stmt kind
;
3940 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
3942 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
3945 kind
= scalar_store
;
3947 else if (vect_nop_conversion_p (orig_stmt_info
))
3951 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
3952 SLP_TREE_VECTYPE (node
), 0, vect_body
);
3955 auto_vec
<bool, 20> subtree_life
;
3956 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3958 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3960 /* Do not directly pass LIFE to the recursive call, copy it to
3961 confine changes in the callee to the current child/subtree. */
3962 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3964 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
3965 for (unsigned j
= 0;
3966 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
3968 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
3969 if (perm
.first
== i
)
3970 subtree_life
[perm
.second
] = (*life
)[j
];
3975 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
3976 subtree_life
.safe_splice (*life
);
3978 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
3980 subtree_life
.truncate (0);
3985 /* Check if vectorization of the basic block is profitable for the
3986 subgraph denoted by SLP_INSTANCES. */
/* NOTE(review): lossily-extracted fragment — braces, the final return and
   some dump arguments are missing; code kept byte-identical, comments only.
   Cost-model gate for one independent SLP subgraph: sums the scalar cost
   of the stmts to be replaced (vect_bb_slp_scalar_cost) and the already
   collected per-instance vector costs, finishes both through the target
   cost hooks, then compares vector (inside + prologue + epilogue) against
   scalar cost.  */
3989 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
3990 vec
<slp_instance
> slp_instances
)
3992 slp_instance instance
;
3994 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
3995 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
3997 void *vect_target_cost_data
= init_cost (NULL
);
3999 /* Calculate scalar cost and sum the cost for the vector stmts
4000 previously collected. */
4001 stmt_vector_for_cost scalar_costs
;
4002 scalar_costs
.create (0);
4003 hash_set
<slp_tree
> visited
;
4004 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4006 auto_vec
<bool, 20> life
;
4007 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4009 vect_bb_slp_scalar_cost (bb_vinfo
,
4010 SLP_INSTANCE_TREE (instance
),
4011 &life
, &scalar_costs
, visited
);
4012 add_stmt_costs (bb_vinfo
, vect_target_cost_data
, &instance
->cost_vec
);
4013 instance
->cost_vec
.release ();
4015 /* Unset visited flag. */
4016 stmt_info_for_cost
*si
;
4017 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
4018 gimple_set_visited (si
->stmt_info
->stmt
, false);
4020 void *scalar_target_cost_data
= init_cost (NULL
);
4021 add_stmt_costs (bb_vinfo
, scalar_target_cost_data
, &scalar_costs
);
4022 scalar_costs
.release ();
4024 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4025 destroy_cost_data (scalar_target_cost_data
);
4027 /* Complete the target-specific vector cost calculation. */
4028 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4029 &vec_inside_cost
, &vec_epilogue_cost
);
4030 destroy_cost_data (vect_target_cost_data
);
4032 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4034 if (dump_enabled_p ())
4036 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4037 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
4039 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
4040 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
4041 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
4044 /* Vectorization is profitable if its cost is more than the cost of scalar
4045 version. Note that we err on the vector side for equal cost because
4046 the cost estimate is otherwise quite pessimistic (constant uses are
4047 free on the scalar side but cost a load on the vector side for
4049 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4055 /* Find any vectorizable constructors and add them to the grouped_store
/* NOTE(review): lossily-extracted fragment — the end of the comment above,
   braces and continue statements are missing; code kept byte-identical,
   comments only.
   Scans every stmt of the region for assignments of a vector CONSTRUCTOR
   that fills all lanes with scalar (non-vector) elements and is not a
   uniform splat, and records them as grouped stores so SLP discovery can
   start from them.  */
4059 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4061 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4062 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4063 !gsi_end_p (gsi
); gsi_next (&gsi
))
4065 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4066 if (!assign
|| gimple_assign_rhs_code (assign
) != CONSTRUCTOR
)
/* Reject partial CTORs, CTORs of vectors (concats), and splats.  */
4069 tree rhs
= gimple_assign_rhs1 (assign
);
4070 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4071 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4072 CONSTRUCTOR_NELTS (rhs
))
4073 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4074 || uniform_vector_p (rhs
))
4077 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4078 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4082 /* Check if the region described by BB_VINFO can be vectorized, returning
4083 true if so. When returning false, set FATAL to true if the same failure
4084 would prevent vectorization at other vector sizes, false if it is still
4085 worth trying other sizes. N_STMTS is the number of statements in the
/* NOTE(review): lossily-extracted fragment — braces, returns, `fatal`
   assignments and part of the leading comment are missing; code kept
   byte-identical, comments only.
   The per-vector-mode analysis pipeline for a BB region: data-ref
   analysis, access analysis, constructor detection, pattern recog, SLP
   discovery, permutation optimization, alignment/dependence checks per
   instance (removing failing instances), stmt marking, operation analysis,
   and finally subgraph partitioning.  */
4089 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4090 vec
<int> *dataref_groups
)
4092 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4094 slp_instance instance
;
4096 poly_uint64 min_vf
= 2;
4098 /* The first group of checks is independent of the vector size. */
4101 /* Analyze the data references. */
4103 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4105 if (dump_enabled_p ())
4106 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4107 "not vectorized: unhandled data-ref in basic "
4112 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4114 if (dump_enabled_p ())
4115 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4116 "not vectorized: unhandled data access in "
4121 vect_slp_check_for_constructors (bb_vinfo
);
4123 /* If there are no grouped stores and no constructors in the region
4124 there is no need to continue with pattern recog as vect_analyze_slp
4125 will fail anyway. */
4126 if (bb_vinfo
->grouped_stores
.is_empty ())
4128 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4130 "not vectorized: no grouped stores in "
4135 /* While the rest of the analysis below depends on it in some way. */
4138 vect_pattern_recog (bb_vinfo
);
4140 /* Check the SLP opportunities in the basic block, analyze and build SLP
4142 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4144 if (dump_enabled_p ())
4146 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4147 "Failed to SLP the basic block.\n");
4148 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4149 "not vectorized: failed to find SLP opportunities "
4150 "in basic block.\n");
4155 /* Optimize permutations. */
4156 vect_optimize_slp (bb_vinfo
);
4158 vect_record_base_alignments (bb_vinfo
);
4160 /* Analyze and verify the alignment of data references and the
4161 dependence in the SLP instances. */
/* No ++i in the loop header: removal slides the next instance into slot i;
   the increment presumably lives in a missing else-path.  */
4162 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4164 vect_location
= instance
->location ();
4165 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4166 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4168 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4169 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4170 if (dump_enabled_p ())
4171 dump_printf_loc (MSG_NOTE
, vect_location
,
4172 "removing SLP instance operations starting from: %G",
4174 vect_free_slp_instance (instance
);
4175 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4179 /* Mark all the statements that we want to vectorize as pure SLP and
4181 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4182 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4183 if (SLP_INSTANCE_ROOT_STMT (instance
))
4184 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance
)) = pure_slp
;
4188 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4191 if (!vect_slp_analyze_operations (bb_vinfo
))
4193 if (dump_enabled_p ())
4194 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4195 "not vectorized: bad operation in basic block.\n");
4199 vect_bb_partition_graph (bb_vinfo
);
4204 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4205 basic blocks in BBS, returning true on success.
4206 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
/* NOTE(review): lossily-extracted fragment — the outer mode-iteration loop
   structure, braces, `fatal`, returns and `delete bb_vinfo` are not visible;
   code kept byte-identical, comments only.
   Drives BB vectorization over candidate vector modes: for each mode it
   builds a _bb_vec_info, runs vect_slp_analyze_bb_1, and on success applies
   the cost model per independent subgraph and schedules profitable ones via
   vect_schedule_slp.  On failure/continuation it skips modes whose analysis
   would repeat the autodetected mode's result and retries with the next
   mode.  */
4209 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4210 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4212 bb_vec_info bb_vinfo
;
4213 auto_vector_modes vector_modes
;
4215 /* Autodetect first vector size we try. */
4216 machine_mode next_vector_mode
= VOIDmode
;
4217 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4218 unsigned int mode_i
= 0;
4220 vec_info_shared shared
;
4222 machine_mode autodetected_vector_mode
= VOIDmode
;
4225 bool vectorized
= false;
4227 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
/* Datarefs are saved on the first iteration and re-verified on later
   mode retries so the shared analysis stays consistent.  */
4229 bool first_time_p
= shared
.datarefs
.is_empty ();
4230 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4232 bb_vinfo
->shared
->save_datarefs ();
4234 bb_vinfo
->shared
->check_datarefs ();
4235 bb_vinfo
->vector_mode
= next_vector_mode
;
4237 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
)
4238 && dbg_cnt (vect_slp
))
4240 if (dump_enabled_p ())
4242 dump_printf_loc (MSG_NOTE
, vect_location
,
4243 "***** Analysis succeeded with vector mode"
4244 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4245 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4248 bb_vinfo
->shared
->check_datarefs ();
/* Cost and transform each independent subgraph separately; only leaders
   have non-empty subgraph_entries after vect_bb_partition_graph.  */
4251 slp_instance instance
;
4252 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4254 if (instance
->subgraph_entries
.is_empty ())
4257 vect_location
= instance
->location ();
4258 if (!unlimited_cost_model (NULL
)
4259 && !vect_bb_vectorization_profitable_p
4260 (bb_vinfo
, instance
->subgraph_entries
))
4262 if (dump_enabled_p ())
4263 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4264 "not vectorized: vectorization is not "
4269 if (!vectorized
&& dump_enabled_p ())
4270 dump_printf_loc (MSG_NOTE
, vect_location
,
4271 "Basic block will be vectorized "
4275 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4277 unsigned HOST_WIDE_INT bytes
;
4278 if (dump_enabled_p ())
4281 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4282 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4283 "basic block part vectorized using %wu "
4284 "byte vectors\n", bytes
);
4286 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4287 "basic block part vectorized using "
4288 "variable length vectors\n");
4294 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_NOTE
, vect_location
,
4296 "***** Analysis failed with vector mode %s\n",
4297 GET_MODE_NAME (bb_vinfo
->vector_mode
));
4301 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
/* Skip candidate modes the target reports would choose the same modes
   as the analysis just performed.  */
4304 while (mode_i
< vector_modes
.length ()
4305 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
4307 if (dump_enabled_p ())
4308 dump_printf_loc (MSG_NOTE
, vect_location
,
4309 "***** The result for vector mode %s would"
4311 GET_MODE_NAME (vector_modes
[mode_i
]));
/* Also skip a mode that is mutually "related" to the autodetected one —
   analyzing it would merely repeat the autodetected analysis.  */
4317 if (mode_i
< vector_modes
.length ()
4318 && VECTOR_MODE_P (autodetected_vector_mode
)
4319 && (related_vector_mode (vector_modes
[mode_i
],
4320 GET_MODE_INNER (autodetected_vector_mode
))
4321 == autodetected_vector_mode
)
4322 && (related_vector_mode (autodetected_vector_mode
,
4323 GET_MODE_INNER (vector_modes
[mode_i
]))
4324 == vector_modes
[mode_i
]))
4326 if (dump_enabled_p ())
4327 dump_printf_loc (MSG_NOTE
, vect_location
,
4328 "***** Skipping vector mode %s, which would"
4329 " repeat the analysis for %s\n",
4330 GET_MODE_NAME (vector_modes
[mode_i
]),
4331 GET_MODE_NAME (autodetected_vector_mode
));
4336 || mode_i
== vector_modes
.length ()
4337 || autodetected_vector_mode
== VOIDmode
4338 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4339 vector sizes will fail do not bother iterating. */
4343 /* Try the next biggest vector size. */
4344 next_vector_mode
= vector_modes
[mode_i
++];
4345 if (dump_enabled_p ())
4346 dump_printf_loc (MSG_NOTE
, vect_location
,
4347 "***** Re-trying analysis with vector mode %s\n",
4348 GET_MODE_NAME (next_vector_mode
));
4353 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4354 true if anything in the basic-block was vectorized. */
/* NOTE(review): lossily-extracted fragment — return type, braces, the `insns`
   counter and group-boundary handling are not visible; code kept
   byte-identical, comments only.
   Collects data references (and per-dataref group ids) from all non-debug
   stmts of the region, tracking the nearest useful dump location, then
   hands the region to vect_slp_region.  */
4357 vect_slp_bbs (vec
<basic_block
> bbs
)
4359 vec
<data_reference_p
> datarefs
= vNULL
;
4360 auto_vec
<int> dataref_groups
;
4362 int current_group
= 0;
4364 for (unsigned i
= 0; i
< bbs
.length (); i
++)
4366 basic_block bb
= bbs
[i
];
4367 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
4370 gimple
*stmt
= gsi_stmt (gsi
);
4371 if (is_gimple_debug (stmt
))
4376 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
4377 vect_location
= stmt
;
4379 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
4380 &dataref_groups
, current_group
))
4385 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
4388 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4389 true if anything in the basic-block was vectorized. */
/* NOTE(review): lossily-extracted fragment — the push of BB into BBS is not
   visible in this view; code kept byte-identical, comments only.
   Single-block convenience wrapper over vect_slp_bbs.  */
4392 vect_slp_bb (basic_block bb
)
4394 auto_vec
<basic_block
> bbs
;
4396 return vect_slp_bbs (bbs
);
4399 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4400 true if anything in the basic-block was vectorized. */
/* NOTE(review): lossily-extracted fragment — return type, braces, the
   `split` flag management, bbs truncation after flushing, and the final
   free of RPO are not visible; code kept byte-identical, comments only.
   Function-wide BB-vectorization driver: walks blocks in RPO, greedily
   accumulating a region and flushing it to vect_slp_bbs at split points
   (dominance boundary, loop exit, control-altering stmt with a def).  */
4403 vect_slp_function (function
*fun
)
4406 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
4407 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
4409 /* For the moment split the function into pieces to avoid making
4410 the iteration on the vector mode moot. Split at points we know
4411 to not handle well which is CFG merges (SLP discovery doesn't
4412 handle non-loop-header PHIs) and loop exits. Since pattern
4413 recog requires reverse iteration to visit uses before defs
4414 simply chop RPO into pieces. */
4415 auto_vec
<basic_block
> bbs
;
4416 for (unsigned i
= 0; i
< n
; i
++)
4418 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
4421 /* Split when a BB is not dominated by the first block. */
4422 if (!bbs
.is_empty ()
4423 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
4425 if (dump_enabled_p ())
4426 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4427 "splitting region at dominance boundary bb%d\n",
4431 /* Split when the loop determined by the first block
4432 is exited. This is because we eventually insert
4433 invariants at region begin. */
4434 else if (!bbs
.is_empty ()
4435 && bbs
[0]->loop_father
!= bb
->loop_father
4436 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
4438 if (dump_enabled_p ())
4439 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4440 "splitting region at loop %d exit at bb%d\n",
4441 bbs
[0]->loop_father
->num
, bb
->index
);
/* Flush the accumulated region before starting a new one.  */
4445 if (split
&& !bbs
.is_empty ())
4447 r
|= vect_slp_bbs (bbs
);
4449 bbs
.quick_push (bb
);
4454 /* When we have a stmt ending this block and defining a
4455 value we have to insert on edges when inserting after it for
4456 a vector containing its definition. Avoid this for now. */
4457 if (gimple
*last
= last_stmt (bb
))
4458 if (gimple_get_lhs (last
)
4459 && is_ctrl_altering_stmt (last
))
4461 if (dump_enabled_p ())
4462 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4463 "splitting region at control altering "
4464 "definition %G", last
);
4465 r
|= vect_slp_bbs (bbs
);
/* Flush any trailing region at function end.  */
4470 if (!bbs
.is_empty ())
4471 r
|= vect_slp_bbs (bbs
);
/* NOTE(review): garbled extraction — statements split mid-expression and
   interior lines (braces, the RESULTS parameter declaration, some loop
   bodies) are missing.  Annotations only; restore from upstream before
   compiling.  */
4478 /* Build a variable-length vector in which the elements in ELTS are repeated
4479 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4480 RESULTS and add any new instructions to SEQ.
4482 The approach we use is:
4484 (1) Find a vector mode VM with integer elements of mode IM.
4486 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4487 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4488 from small vectors to IM.
4490 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4492 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4493 correct byte contents.
4495 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4497 We try to find the largest IM for which this sequence works, in order
4498 to cut down on the number of interleaves. */
4501 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
4502 vec
<tree
> elts
, unsigned int nresults
,
4505 unsigned int nelts
= elts
.length ();
4506 tree element_type
= TREE_TYPE (vector_type
);
4508 /* (1) Find a vector mode VM with integer elements of mode IM. */
4509 unsigned int nvectors
= 1;
4510 tree new_vector_type
;
4512 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
4513 &nvectors
, &new_vector_type
,
4517 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4518 unsigned int partial_nelts
= nelts
/ nvectors
;
4519 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
4521 tree_vector_builder partial_elts
;
4522 auto_vec
<tree
, 32> pieces (nvectors
* 2);
4523 pieces
.quick_grow (nvectors
* 2);
4524 for (unsigned int i
= 0; i
< nvectors
; ++i
)
4526 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4527 ELTS' has mode IM. */
4528 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
4529 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
4530 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
4531 tree t
= gimple_build_vector (seq
, &partial_elts
);
4532 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
4533 TREE_TYPE (new_vector_type
), t
);
4535 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4536 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
4539 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4540 correct byte contents.
4542 We need to repeat the following operation log2(nvectors) times:
4544 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4545 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4547 However, if each input repeats every N elements and the VF is
4548 a multiple of N * 2, the HI result is the same as the LO. */
4549 unsigned int in_start
= 0;
4550 unsigned int out_start
= nvectors
;
4551 unsigned int hi_start
= nvectors
/ 2;
4552 /* A bound on the number of outputs needed to produce NRESULTS results
4553 in the final iteration. */
4554 unsigned int noutputs_bound
= nvectors
* nresults
;
4555 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
4557 noutputs_bound
/= 2;
4558 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
4559 for (unsigned int i
= 0; i
< limit
; ++i
)
/* Reuse the previous output when the HI result equals the LO result
   (see the comment above about repeat period vs. VF).  */
4562 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
4565 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
4569 tree output
= make_ssa_name (new_vector_type
);
4570 tree input1
= pieces
[in_start
+ (i
/ 2)];
4571 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
4572 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
4575 gimple_seq_add_stmt (seq
, stmt
);
4576 pieces
[out_start
+ i
] = output
;
/* Ping-pong between the two halves of PIECES for the next level.  */
4578 std::swap (in_start
, out_start
);
4581 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4582 results
.reserve (nresults
);
4583 for (unsigned int i
= 0; i
< nresults
; ++i
)
4585 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
4586 pieces
[in_start
+ i
]));
4588 results
.quick_push (results
[i
- nvectors
]);
/* NOTE(review): garbled extraction — statements split mid-expression and
   interior lines are missing (braces, declarations of OP/ORIG_OP/VEC_CST/
   INIT_STMT/NEW_BB/VOP and several branch bodies).  Annotations only;
   restore from upstream before compiling.  */
4592 /* For constant and loop invariant defs in OP_NODE this function creates
4593 vector defs that will be used in the vectorized stmts and stores them
4594 to SLP_TREE_VEC_DEFS of OP_NODE. */
4597 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
4599 unsigned HOST_WIDE_INT nunits
;
4601 unsigned j
, number_of_places_left_in_vector
;
4604 int group_size
= op_node
->ops
.length ();
4605 unsigned int vec_num
, i
;
4606 unsigned number_of_copies
= 1;
4608 gimple_seq ctor_seq
= NULL
;
4609 auto_vec
<tree
, 16> permute_results
;
4611 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4612 vector_type
= SLP_TREE_VECTYPE (op_node
);
4614 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
4615 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
4616 auto_vec
<tree
> voprnds (number_of_vectors
);
4618 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4619 created vectors. It is greater than 1 if unrolling is performed.
4621 For example, we have two scalar operands, s1 and s2 (e.g., group of
4622 strided accesses of size two), while NUNITS is four (i.e., four scalars
4623 of this type can be packed in a vector). The output vector will contain
4624 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4627 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4628 containing the operands.
4630 For example, NUNITS is four as before, and the group size is 8
4631 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4632 {s5, s6, s7, s8}. */
4634 /* When using duplicate_and_interleave, we just need one element for
4635 each scalar statement. */
4636 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4637 nunits
= group_size
;
4639 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
4641 number_of_places_left_in_vector
= nunits
;
4643 tree_vector_builder
elts (vector_type
, nunits
, 1);
4644 elts
.quick_grow (nunits
);
4645 stmt_vec_info insert_after
= NULL
;
4646 for (j
= 0; j
< number_of_copies
; j
++)
/* Walk the group's scalar operands in reverse; OP is presumably
   declared on a missing line — TODO confirm against upstream.  */
4649 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
4651 /* Create 'vect_ = {op0,op1,...,opn}'. */
4652 number_of_places_left_in_vector
--;
4654 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
4656 if (CONSTANT_CLASS_P (op
))
4658 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4660 /* Can't use VIEW_CONVERT_EXPR for booleans because
4661 of possibly different sizes of scalar value and
4663 if (integer_zerop (op
))
4664 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
4665 else if (integer_onep (op
))
4666 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
4671 op
= fold_unary (VIEW_CONVERT_EXPR
,
4672 TREE_TYPE (vector_type
), op
);
4673 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
/* Non-constant operand of mismatched type: materialize a conversion
   statement into CTOR_SEQ.  */
4677 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
4679 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4682 = build_all_ones_cst (TREE_TYPE (vector_type
));
4684 = build_zero_cst (TREE_TYPE (vector_type
));
4685 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
4686 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
4692 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
4695 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
4698 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
4702 elts
[number_of_places_left_in_vector
] = op
;
4703 if (!CONSTANT_CLASS_P (op
))
4705 /* For BB vectorization we have to compute an insert location
4706 when a def is inside the analyzed region since we cannot
4707 simply insert at the BB start in this case. */
4708 stmt_vec_info opdef
;
4709 if (TREE_CODE (orig_op
) == SSA_NAME
4710 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
4711 && is_a
<bb_vec_info
> (vinfo
)
4712 && (opdef
= vinfo
->lookup_def (orig_op
)))
4715 insert_after
= opdef
;
4717 insert_after
= get_later_stmt (insert_after
, opdef
);
/* A full vector's worth of operands has been gathered: build the
   constant/CTOR (or go through duplicate_and_interleave for
   variable-length vectors).  */
4720 if (number_of_places_left_in_vector
== 0)
4723 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4724 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4725 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
4728 if (permute_results
.is_empty ())
4729 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
4730 elts
, number_of_vectors
,
4732 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
4734 if (!gimple_seq_empty_p (ctor_seq
))
4738 gimple_stmt_iterator gsi
;
4739 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
4741 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
4742 gsi_insert_seq_before (&gsi
, ctor_seq
,
4743 GSI_CONTINUE_LINKING
);
4745 else if (!stmt_ends_bb_p (insert_after
->stmt
))
4747 gsi
= gsi_for_stmt (insert_after
->stmt
);
4748 gsi_insert_seq_after (&gsi
, ctor_seq
,
4749 GSI_CONTINUE_LINKING
);
4753 /* When we want to insert after a def where the
4754 defining stmt throws then insert on the fallthru
4756 edge e
= find_fallthru_edge
4757 (gimple_bb (insert_after
->stmt
)->succs
);
4759 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
4760 gcc_assert (!new_bb
);
4764 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
4767 voprnds
.quick_push (vec_cst
);
4768 insert_after
= NULL
;
4769 number_of_places_left_in_vector
= nunits
;
4771 elts
.new_vector (vector_type
, nunits
, 1);
4772 elts
.quick_grow (nunits
);
4777 /* Since the vectors are created in the reverse order, we should invert
4779 vec_num
= voprnds
.length ();
4780 for (j
= vec_num
; j
!= 0; j
--)
4782 vop
= voprnds
[j
- 1];
4783 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4786 /* In case that VF is greater than the unrolling factor needed for the SLP
4787 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4788 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4789 to replicate the vectors. */
4790 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
4791 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
4793 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
/* NOTE(review): garbled extraction — lines split mid-expression; the return
   type, braces and an else keyword are missing.  Annotations only.  */
4796 /* Get the Ith vectorized definition from SLP_NODE. */
4799 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
/* Prefer the lhs of the I-th vectorized statement when statements were
   generated; otherwise fall back to the recorded vector defs.  */
4801 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
4802 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
4804 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
/* NOTE(review): garbled extraction — lines split mid-expression; braces and
   the loop-index declaration are missing.  Annotations only.  */
4807 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4810 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
4812 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
/* Internal defs expose their values as statement lhs's; external/constant
   defs are already collected in SLP_TREE_VEC_DEFS and are spliced over.  */
4813 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
4816 gimple
*vec_def_stmt
;
4817 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
4818 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
4821 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
/* NOTE(review): garbled extraction — lines split mid-expression; braces and
   the guard that defaults N are missing.  Annotations only.  */
4824 /* Get N vectorized definitions for SLP_NODE. */
4827 vect_get_slp_defs (vec_info
*,
4828 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
/* N presumably defaults to the number of children when unspecified —
   the guarding condition line is missing; TODO confirm upstream.  */
4831 n
= SLP_TREE_CHILDREN (slp_node
).length ();
4833 for (unsigned i
= 0; i
< n
; ++i
)
4835 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
4836 vec
<tree
> vec_defs
= vNULL
;
4837 vect_get_slp_defs (child
, &vec_defs
);
4838 vec_oprnds
->quick_push (vec_defs
);
/* NOTE(review): garbled extraction — statements split mid-expression and
   interior lines are missing (braces, declarations of MODE/VEC_INDEX/NOOP_P/
   PERM_DEST/PERM_STMT, several branch bodies and the N_PERMS accounting).
   Annotations only; restore from upstream before compiling.  */
4842 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4843 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4844 permute statements for the SLP node NODE. Store the number of vector
4845 permute instructions in *N_PERMS and the number of vector load
4846 instructions in *N_LOADS. */
4849 vect_transform_slp_perm_load (vec_info
*vinfo
,
4850 slp_tree node
, vec
<tree
> dr_chain
,
4851 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
4852 bool analyze_only
, unsigned *n_perms
,
4853 unsigned int *n_loads
)
4855 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4857 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4858 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
4859 unsigned int mask_element
;
4862 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
4865 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4867 mode
= TYPE_MODE (vectype
);
4868 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4870 /* Initialize the vect stmts of NODE to properly insert the generated
4873 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
4874 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
4875 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
4877 /* Generate permutation masks for every NODE. Number of masks for each NODE
4878 is equal to GROUP_SIZE.
4879 E.g., we have a group of three nodes with three loads from the same
4880 location in each node, and the vector size is 4. I.e., we have a
4881 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4882 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4883 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4886 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4887 The last mask is illegal since we assume two operands for permute
4888 operation, and the mask element values can't be outside that range.
4889 Hence, the last mask must be converted into {2,5,5,5}.
4890 For the first two permutations we need the first and the second input
4891 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4892 we need the second and the third vectors: {b1,c1,a2,b2} and
4895 int vect_stmts_counter
= 0;
4896 unsigned int index
= 0;
4897 int first_vec_index
= -1;
4898 int second_vec_index
= -1;
4902 vec_perm_builder mask
;
4903 unsigned int nelts_to_build
;
4904 unsigned int nvectors_per_build
;
4905 unsigned int in_nlanes
;
4906 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
4907 && multiple_p (nunits
, group_size
));
4910 /* A single vector contains a whole number of copies of the node, so:
4911 (a) all permutes can use the same mask; and
4912 (b) the permutes only need a single vector input. */
4913 mask
.new_vector (nunits
, group_size
, 3);
4914 nelts_to_build
= mask
.encoded_nelts ();
4915 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
4916 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
4920 /* We need to construct a separate mask for each vector statement. */
4921 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
4922 if (!nunits
.is_constant (&const_nunits
)
4923 || !vf
.is_constant (&const_vf
))
4925 mask
.new_vector (const_nunits
, const_nunits
, 1);
4926 nelts_to_build
= const_vf
* group_size
;
4927 nvectors_per_build
= 1;
4928 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
4930 auto_sbitmap
used_in_lanes (in_nlanes
);
4931 bitmap_clear (used_in_lanes
);
4933 unsigned int count
= mask
.encoded_nelts ();
4934 mask
.quick_grow (count
);
4935 vec_perm_indices indices
;
/* Build the mask one element at a time, tracking which input vectors
   (at most two) the current mask refers to.  */
4937 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
4939 unsigned int iter_num
= j
/ group_size
;
4940 unsigned int stmt_num
= j
% group_size
;
4941 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
4942 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
4943 bitmap_set_bit (used_in_lanes
, i
);
4946 first_vec_index
= 0;
4951 /* Enforced before the loop when !repeating_p. */
4952 unsigned int const_nunits
= nunits
.to_constant ();
4953 vec_index
= i
/ const_nunits
;
4954 mask_element
= i
% const_nunits
;
4955 if (vec_index
== first_vec_index
4956 || first_vec_index
== -1)
4958 first_vec_index
= vec_index
;
4960 else if (vec_index
== second_vec_index
4961 || second_vec_index
== -1)
4963 second_vec_index
= vec_index
;
4964 mask_element
+= const_nunits
;
/* A third distinct input vector would be needed — unsupported.  */
4968 if (dump_enabled_p ())
4969 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4970 "permutation requires at "
4971 "least three vectors %G",
4973 gcc_assert (analyze_only
);
4977 gcc_assert (mask_element
< 2 * const_nunits
);
4980 if (mask_element
!= index
)
4982 mask
[index
++] = mask_element
;
4984 if (index
== count
&& !noop_p
)
4986 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
4987 if (!can_vec_perm_const_p (mode
, indices
))
4989 if (dump_enabled_p ())
4991 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
4993 "unsupported vect permute { ");
4994 for (i
= 0; i
< count
; ++i
)
4996 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
4997 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
4999 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5001 gcc_assert (analyze_only
);
5012 tree mask_vec
= NULL_TREE
;
5015 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5017 if (second_vec_index
== -1)
5018 second_vec_index
= first_vec_index
;
5020 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5022 /* Generate the permute statement if necessary. */
5023 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5024 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5028 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5030 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5032 perm_dest
= make_ssa_name (perm_dest
);
5034 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5035 first_vec
, second_vec
,
5037 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5041 /* If mask was NULL_TREE generate the requested
5042 identity transform. */
5043 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
5045 /* Store the vector statement in NODE. */
5046 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5051 first_vec_index
= -1;
5052 second_vec_index
= -1;
5060 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5063 /* Enforced above when !repeating_p. */
5064 unsigned int const_nunits
= nunits
.to_constant ();
/* Count how many input vectors are actually touched, per the
   USED_IN_LANES bitmap, presumably to refine *N_LOADS — the
   accumulation lines are missing; TODO confirm upstream.  */
5066 bool load_seen
= false;
5067 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5069 if (i
% const_nunits
== 0)
5075 if (bitmap_bit_p (used_in_lanes
, i
))
/* NOTE(review): garbled extraction — statements split mid-expression and
   interior lines missing (braces, the GSI/analysis-mode guards, FIRST_DEF/
   SECOND_DEF/PERM_STMT declarations, nperms accounting).  Annotations only;
   restore from upstream before compiling.  */
5087 /* Vectorize the SLP permutations in NODE as specified
5088 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5089 child number and lane number.
5090 Interleaving of two two-lane two-child SLP subtrees (not supported):
5091 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5092 A blend of two four-lane two-child SLP subtrees:
5093 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5094 Highpart of a four-lane one-child SLP subtree (not supported):
5095 [ { 0, 2 }, { 0, 3 } ]
5096 Where currently only a subset is supported by code generating below. */
5099 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5100 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5102 tree vectype
= SLP_TREE_VECTYPE (node
);
5104 /* ??? We currently only support all same vector input and output types
5105 while the SLP IL should really do a concat + select and thus accept
5106 arbitrary mismatches. */
5109 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5110 if (!types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5112 if (dump_enabled_p ())
5113 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5114 "Unsupported lane permutation\n");
5118 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5119 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5120 if (dump_enabled_p ())
5122 dump_printf_loc (MSG_NOTE
, vect_location
,
5123 "vectorizing permutation");
5124 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5125 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5126 dump_printf (MSG_NOTE
, "\n");
5129 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5130 if (!nunits
.is_constant ())
5132 unsigned HOST_WIDE_INT vf
= 1;
5133 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5134 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
5136 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5137 gcc_assert (multiple_p (olanes
, nunits
));
5139 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5140 from the { SLP operand, scalar lane } permutation as recorded in the
5141 SLP node as intermediate step. This part should already work
5142 with SLP children with arbitrary number of lanes. */
5143 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5144 auto_vec
<unsigned> active_lane
;
5145 vperm
.create (olanes
);
5146 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5147 for (unsigned i
= 0; i
< vf
; ++i
)
5149 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5151 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5152 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5153 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
5154 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5155 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5156 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5158 /* Advance to the next group. */
5159 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5160 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5163 if (dump_enabled_p ())
5165 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5166 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5168 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5169 dump_printf (MSG_NOTE
, ",");
5170 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5171 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
5172 vperm
[i
].first
.second
);
5174 dump_printf (MSG_NOTE
, "\n");
5177 /* We can only handle two-vector permutes, everything else should
5178 be lowered on the SLP level. The following is closely inspired
5179 by vect_transform_slp_perm_load and is supposed to eventually
5181 ??? As intermediate step do code-gen in the SLP tree representation
5183 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5184 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5185 unsigned int const_nunits
= nunits
.to_constant ();
5186 unsigned int index
= 0;
5187 unsigned int mask_element
;
5188 vec_perm_builder mask
;
5189 mask
.new_vector (const_nunits
, const_nunits
, 1);
5190 unsigned int count
= mask
.encoded_nelts ();
5191 mask
.quick_grow (count
);
5192 vec_perm_indices indices
;
5193 unsigned nperms
= 0;
5194 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5196 mask_element
= vperm
[i
].second
;
5197 if (first_vec
.first
== -1U
5198 || first_vec
== vperm
[i
].first
)
5199 first_vec
= vperm
[i
].first
;
5200 else if (second_vec
.first
== -1U
5201 || second_vec
== vperm
[i
].first
)
5203 second_vec
= vperm
[i
].first
;
5204 mask_element
+= const_nunits
;
/* More than two distinct input vectors — unsupported here.  */
5208 if (dump_enabled_p ())
5209 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5210 "permutation requires at "
5211 "least three vectors");
5216 mask
[index
++] = mask_element
;
5220 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5222 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5224 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5226 if (dump_enabled_p ())
5228 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5230 "unsupported vect permute { ");
5231 for (i
= 0; i
< count
; ++i
)
5233 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5234 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5236 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5246 if (second_vec
.first
== -1U)
5247 second_vec
= first_vec
;
5249 /* Generate the permute statement if necessary. */
5250 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5252 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5254 tree perm_dest
= make_ssa_name (vectype
);
5257 slp_tree second_node
5258 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5260 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5261 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5262 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5263 first_def
, second_def
,
5267 /* We need a copy here in case the def was external. */
5268 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5269 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5270 /* Store the vector statement in NODE. */
5271 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
5275 first_vec
= std::make_pair (-1U, -1U);
5276 second_vec
= std::make_pair (-1U, -1U);
/* Analysis mode: record the permute count into the cost vector.  */
5281 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
/* NOTE(review): garbled extraction — statements split mid-expression and
   interior lines missing (braces, early returns, I/J/CHILD/VSTMT/DEF/VDEF
   declarations, several guard conditions).  Annotations only; restore from
   upstream before compiling.  */
5286 /* Vectorize SLP NODE. */
5289 vect_schedule_slp_node (vec_info
*vinfo
,
5290 slp_tree node
, slp_instance instance
)
5292 gimple_stmt_iterator si
;
5296 /* For existing vectors there's nothing to do. */
5297 if (SLP_TREE_VEC_DEFS (node
).exists ())
5300 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
5302 /* Vectorize externals and constants. */
5303 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
5304 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
5306 /* ??? vectorizable_shift can end up using a scalar operand which is
5307 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5308 node in this case. */
5309 if (!SLP_TREE_VECTYPE (node
))
5312 vect_create_constant_vectors (vinfo
, node
);
5316 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
5318 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
5319 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
5321 if (dump_enabled_p ())
5322 dump_printf_loc (MSG_NOTE
, vect_location
,
5323 "------>vectorizing SLP node starting from: %G",
/* Pick the insertion point SI for the generated vector statements.  */
5326 if (STMT_VINFO_DATA_REF (stmt_info
)
5327 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5329 /* Vectorized loads go before the first scalar load to make it
5330 ready early, vectorized stores go before the last scalar
5331 stmt which is where all uses are ready. */
5332 stmt_vec_info last_stmt_info
= NULL
;
5333 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
5334 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
5335 else /* DR_IS_WRITE */
5336 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
5337 si
= gsi_for_stmt (last_stmt_info
->stmt
);
5339 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
5340 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
5341 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
5342 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5344 /* For PHI node vectorization we do not use the insertion iterator. */
5349 /* Emit other stmts after the children vectorized defs which is
5350 earliest possible. */
5351 gimple
*last_stmt
= NULL
;
5352 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5353 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
5355 /* For fold-left reductions we are retaining the scalar
5356 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5357 set so the representation isn't perfect. Resort to the
5358 last scalar def here. */
5359 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
5361 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
5362 == cycle_phi_info_type
);
5363 gphi
*phi
= as_a
<gphi
*>
5364 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
5366 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
5369 /* We are emitting all vectorized stmts in the same place and
5370 the last one is the last.
5371 ??? Unless we have a load permutation applied and that
5372 figures to re-use an earlier generated load. */
5375 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
5377 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5380 else if (!SLP_TREE_VECTYPE (child
))
5382 /* For externals we use unvectorized at all scalar defs. */
5385 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
5386 if (TREE_CODE (def
) == SSA_NAME
5387 && !SSA_NAME_IS_DEFAULT_DEF (def
))
5389 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
5391 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
5397 /* For externals we have to look at all defs since their
5398 insertion place is decided per vector. But beware
5399 of pre-existing vectors where we need to make sure
5400 we do not insert before the region boundary. */
5401 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
5402 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
5403 last_stmt
= gsi_stmt (gsi_after_labels
5404 (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]));
5409 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
5410 if (TREE_CODE (vdef
) == SSA_NAME
5411 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
5413 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
5415 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5420 /* This can happen when all children are pre-existing vectors or
5423 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
5424 if (is_a
<gphi
*> (last_stmt
))
5425 si
= gsi_after_labels (gimple_bb (last_stmt
));
5428 si
= gsi_for_stmt (last_stmt
);
5433 bool done_p
= false;
5435 /* Handle purely internal nodes. */
5436 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
5438 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
5439 be shared with different SLP nodes (but usually it's the same
5440 operation apart from the case the stmt is only there for denoting
5441 the actual scalar lane defs ...). So do not call vect_transform_stmt
5442 but open-code it here (partly). */
5443 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
5448 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
/* NOTE(review): garbled extraction — statements split mid-expression and
   interior lines missing (braces, declarations of I/CHILD/LHS/NEW_STMT,
   early returns, continue statements).  Annotations only; restore from
   upstream before compiling.  */
5451 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5452 For loop vectorization this is done in vectorizable_call, but for SLP
5453 it needs to be deferred until end of vect_schedule_slp, because multiple
5454 SLP instances may refer to the same scalar stmt. */
5457 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
5458 slp_tree node
, hash_set
<slp_tree
> &visited
)
5461 gimple_stmt_iterator gsi
;
5465 stmt_vec_info stmt_info
;
/* Only internal defs carry scalar statements; the VISITED set keeps
   the shared-tree walk from processing a node twice.  */
5467 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5470 if (visited
.add (node
))
5473 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5474 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
5476 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
5478 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
5479 if (!stmt
|| gimple_bb (stmt
) == NULL
)
5481 if (is_pattern_stmt_p (stmt_info
)
5482 || !PURE_SLP_STMT (stmt_info
))
/* Replace the pure-SLP call with `lhs = 0' of the matching type.  */
5484 lhs
= gimple_call_lhs (stmt
);
5485 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
5486 gsi
= gsi_for_stmt (stmt
);
5487 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
5488 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
/* NOTE(review): garbled extraction — the return type, braces and any
   leading comment are missing.  Convenience overload: seeds an empty
   VISITED set and delegates to the recursive worker above.  */
5493 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
5495 hash_set
<slp_tree
> visited
;
5496 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
/* NOTE(review): garbled extraction — statements split mid-expression and
   interior lines missing (braces, J/CHILD_STMT declarations, the single-
   vector early path's closing, the CONSTRUCTOR_APPEND_ELT field argument).
   Annotations only; restore from upstream before compiling.  */
5499 /* Vectorize the instance root. */
5502 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
5504 gassign
*rstmt
= NULL
;
/* One vector statement: forward its lhs (view-converted if the types
   differ) straight into the root statement's lhs.  */
5506 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
5511 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5513 tree vect_lhs
= gimple_get_lhs (child_stmt
);
5514 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5515 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
5516 TREE_TYPE (vect_lhs
)))
5517 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
5519 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
/* Multiple vector statements: gather their lhs's into a CONSTRUCTOR
   of the root's rhs type.  */
5523 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
5525 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5528 vec
<constructor_elt
, va_gc
> *v
;
5529 vec_alloc (v
, nelts
);
5531 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5533 CONSTRUCTOR_APPEND_ELT (v
,
5535 gimple_get_lhs (child_stmt
));
5537 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5538 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
5539 tree r_constructor
= build_constructor (rtype
, v
);
5540 rstmt
= gimple_build_assign (lhs
, r_constructor
);
/* Swap the replacement assignment in for the original root stmt.  */
5545 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
5546 gsi_replace (&rgsi
, rstmt
, true);
5556 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
/* NOTE(review): this extract has dropped a number of source lines (the
   fused original line numbers jump, e.g. 5598 -> 5600, 5638 -> 5644,
   5645 -> 5662); the comments below describe only what is visible —
   confirm the missing branches against the upstream file.  */
/* Tarjan-style SCC walk over the SLP graph: per-node dfs/lowlink
   numbers and an on-stack flag live in SCC_INFO, MAXDFS is the running
   DFS counter and STACK is the shared Tarjan node stack.  */
5559 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
5560 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
5561 int &maxdfs
, vec
<slp_tree
> &stack
)
/* NODE must not have been visited yet: create its SCC bookkeeping
   entry and initialize its lowlink from the DFS counter.  */
5564 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
5565 gcc_assert (!existed_p
);
5567 info
->lowlink
= maxdfs
;
/* Non-internal defs (externals/constants) are scheduled immediately
   and never placed on the SCC stack.  */
5571 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5573 info
->on_stack
= false;
5574 vect_schedule_slp_node (vinfo
, node
, instance
);
/* Internal def: push NODE on the Tarjan stack before recursing.  */
5578 info
->on_stack
= true;
5579 stack
.safe_push (node
);
/* DFS into the children, folding each child's lowlink (fresh child)
   or dfs number (child still on the stack, i.e. a back edge) into
   NODE's lowlink.  */
5584 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5588 slp_scc_info
*child_info
= scc_info
.get (child
);
5591 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
5592 /* Recursion might have re-allocated the node. */
5593 info
= scc_info
.get (node
);
5594 child_info
= scc_info
.get (child
);
5595 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
5597 else if (child_info
->on_stack
)
5598 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
/* SCC root test: NODE roots an SCC exactly when lowlink == dfs.
   The controlled statement for the != case (presumably an early
   return) is among the missing lines — verify upstream.  */
5600 if (info
->lowlink
!= info
->dfs
)
/* PHIs code-generated in this SCC whose backedge arguments must be
   filled in afterwards.  */
5603 auto_vec
<slp_tree
, 4> phis_to_fixup
;
/* Singleton SCC: NODE is on top of the stack; schedule it directly.  */
5606 if (stack
.last () == node
)
5609 info
->on_stack
= false;
5610 vect_schedule_slp_node (vinfo
, node
, instance
);
5611 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
5612 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
5613 phis_to_fixup
.quick_push (node
);
/* Non-trivial SCC spanning stack[last_idx ..]: iterate until the
   whole SCC, down to NODE itself, has been scheduled.  */
5618 int last_idx
= stack
.length () - 1;
5619 while (stack
[last_idx
] != node
)
5621 /* We can break the cycle at PHIs who have at least one child
5622 code generated. Then we could re-start the DFS walk until
5623 all nodes in the SCC are covered (we might have new entries
5624 for only back-reachable nodes). But it's simpler to just
5625 iterate and schedule those that are ready. */
5626 unsigned todo
= stack
.length () - last_idx
;
5629 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
5631 slp_tree entry
= stack
[idx
];
/* PHIs can be scheduled before their backedge inputs exist; the
   readiness logic for non-PHIs (lines 5638-5661) is largely missing
   from this extract.  */
5634 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
5635 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
5637 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
5644 else if (scc_info
.get (child
)->on_stack
)
/* ENTRY is ready: generate its code and take it off the stack.  */
5662 vect_schedule_slp_node (vinfo
, entry
, instance
);
5663 scc_info
.get (entry
)->on_stack
= false;
5667 phis_to_fixup
.safe_push (entry
);
/* All SCC members scheduled — pop them from the Tarjan stack.  */
5674 stack
.truncate (last_idx
);
5677 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
5679 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
5681 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
)
;
/* For each incoming edge of the scalar PHI, add the corresponding
   vector def as argument to every vectorized PHI copy, but only for
   internal-def children (others were filled earlier — presumably;
   that path is not visible here).  */
5684 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
5686 unsigned dest_idx
= e
->dest_idx
;
5687 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
5688 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
5690 /* Simply fill all args. */
5691 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
5692 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
5693 vect_get_slp_vect_def (child
, i
),
5694 e
, gimple_phi_arg_location (phi
, dest_idx
));
5699 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5702 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
5704 slp_instance instance
;
5707 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
5709 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5711 slp_tree node
= SLP_INSTANCE_TREE (instance
);
5712 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_NOTE
, vect_location
,
5715 "Vectorizing SLP tree:\n");
5716 if (SLP_INSTANCE_ROOT_STMT (instance
))
5717 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
5718 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
5719 vect_print_slp_graph (MSG_NOTE
, vect_location
,
5720 SLP_INSTANCE_TREE (instance
));
5722 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
5723 have a PHI be the node breaking the cycle. */
5724 auto_vec
<slp_tree
> stack
;
5725 if (!scc_info
.get (node
))
5726 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
5728 if (SLP_INSTANCE_ROOT_STMT (instance
))
5729 vectorize_slp_instance_root_stmt (node
, instance
);
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE
, vect_location
,
5733 "vectorizing stmts using SLP.\n");
5736 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5738 slp_tree root
= SLP_INSTANCE_TREE (instance
);
5739 stmt_vec_info store_info
;
5742 /* Remove scalar call stmts. Do not do this for basic-block
5743 vectorization as not all uses may be vectorized.
5744 ??? Why should this be necessary? DCE should be able to
5745 remove the stmts itself.
5746 ??? For BB vectorization we can as well remove scalar
5747 stmts starting from the SLP tree root if they have no
5749 if (is_a
<loop_vec_info
> (vinfo
))
5750 vect_remove_slp_scalar_calls (vinfo
, root
);
5752 /* Remove vectorized stores original scalar stmts. */
5753 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
5755 if (!STMT_VINFO_DATA_REF (store_info
)
5756 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
5759 store_info
= vect_orig_stmt (store_info
);
5760 /* Free the attached stmt_vec_info and remove the stmt. */
5761 vinfo
->remove_stmt (store_info
);