1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
52 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
53 slp_tree
, stmt_vector_for_cost
*);
55 object_allocator
<_slp_tree
> *slp_tree_pool
;
58 _slp_tree::operator new (size_t n
)
60 gcc_assert (n
== sizeof (_slp_tree
));
61 return slp_tree_pool
->allocate_raw ();
65 _slp_tree::operator delete (void *node
, size_t n
)
67 gcc_assert (n
== sizeof (_slp_tree
));
68 slp_tree_pool
->remove_raw (node
);
72 /* Initialize a SLP node. */
74 _slp_tree::_slp_tree ()
76 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
77 SLP_TREE_SCALAR_OPS (this) = vNULL
;
78 SLP_TREE_VEC_STMTS (this) = vNULL
;
79 SLP_TREE_VEC_DEFS (this) = vNULL
;
80 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
81 SLP_TREE_CHILDREN (this) = vNULL
;
82 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
83 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
84 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
85 SLP_TREE_CODE (this) = ERROR_MARK
;
86 SLP_TREE_VECTYPE (this) = NULL_TREE
;
87 SLP_TREE_REPRESENTATIVE (this) = NULL
;
88 SLP_TREE_REF_COUNT (this) = 1;
93 /* Tear down a SLP node. */
95 _slp_tree::~_slp_tree ()
97 SLP_TREE_CHILDREN (this).release ();
98 SLP_TREE_SCALAR_STMTS (this).release ();
99 SLP_TREE_SCALAR_OPS (this).release ();
100 SLP_TREE_VEC_STMTS (this).release ();
101 SLP_TREE_VEC_DEFS (this).release ();
102 SLP_TREE_LOAD_PERMUTATION (this).release ();
103 SLP_TREE_LANE_PERMUTATION (this).release ();
106 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
109 vect_free_slp_tree (slp_tree node
)
114 if (--SLP_TREE_REF_COUNT (node
) != 0)
117 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
119 vect_free_slp_tree (child
);
124 /* Return a location suitable for dumpings related to the SLP instance. */
127 _slp_instance::location () const
130 return root_stmt
->stmt
;
132 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
136 /* Free the memory allocated for the SLP instance. */
139 vect_free_slp_instance (slp_instance instance
)
141 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
142 SLP_INSTANCE_LOADS (instance
).release ();
143 instance
->subgraph_entries
.release ();
144 instance
->cost_vec
.release ();
149 /* Create an SLP node for SCALAR_STMTS. */
152 vect_create_new_slp_node (slp_tree node
,
153 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
155 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
156 SLP_TREE_CHILDREN (node
).create (nops
);
157 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
158 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
159 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
163 /* Create an SLP node for SCALAR_STMTS. */
166 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
168 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
171 /* Create an SLP node for OPS. */
174 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
176 SLP_TREE_SCALAR_OPS (node
) = ops
;
177 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
178 SLP_TREE_LANES (node
) = ops
.length ();
182 /* Create an SLP node for OPS. */
185 vect_create_new_slp_node (vec
<tree
> ops
)
187 return vect_create_new_slp_node (new _slp_tree
, ops
);
191 /* This structure is used in creation of an SLP tree. Each instance
192 corresponds to the same operand in a group of scalar stmts in an SLP
194 typedef struct _slp_oprnd_info
196 /* Def-stmts for the operands. */
197 vec
<stmt_vec_info
> def_stmts
;
200 /* Information about the first statement, its vector def-type, type, the
201 operand itself in case it's constant, and an indication if it's a pattern
204 enum vect_def_type first_dt
;
209 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
211 static vec
<slp_oprnd_info
>
212 vect_create_oprnd_info (int nops
, int group_size
)
215 slp_oprnd_info oprnd_info
;
216 vec
<slp_oprnd_info
> oprnds_info
;
218 oprnds_info
.create (nops
);
219 for (i
= 0; i
< nops
; i
++)
221 oprnd_info
= XNEW (struct _slp_oprnd_info
);
222 oprnd_info
->def_stmts
.create (group_size
);
223 oprnd_info
->ops
.create (group_size
);
224 oprnd_info
->first_dt
= vect_uninitialized_def
;
225 oprnd_info
->first_op_type
= NULL_TREE
;
226 oprnd_info
->any_pattern
= false;
227 oprnds_info
.quick_push (oprnd_info
);
234 /* Free operands info. */
237 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
240 slp_oprnd_info oprnd_info
;
242 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
244 oprnd_info
->def_stmts
.release ();
245 oprnd_info
->ops
.release ();
246 XDELETE (oprnd_info
);
249 oprnds_info
.release ();
253 /* Return true if STMTS contains a pattern statement. */
256 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
258 stmt_vec_info stmt_info
;
260 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
261 if (is_pattern_stmt_p (stmt_info
))
266 /* Return true when all lanes in the external or constant NODE have
270 vect_slp_tree_uniform_p (slp_tree node
)
272 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
273 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
275 /* Pre-exsting vectors. */
276 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
280 tree op
, first
= NULL_TREE
;
281 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
284 else if (!operand_equal_p (first
, op
, 0))
290 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
291 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
295 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
296 stmt_vec_info first_stmt_info
)
298 stmt_vec_info next_stmt_info
= first_stmt_info
;
301 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
306 if (next_stmt_info
== stmt_info
)
308 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
310 result
+= DR_GROUP_GAP (next_stmt_info
);
312 while (next_stmt_info
);
317 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
318 using the method implemented by duplicate_and_interleave. Return true
319 if so, returning the number of intermediate vectors in *NVECTORS_OUT
320 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
324 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
325 tree elt_type
, unsigned int *nvectors_out
,
326 tree
*vector_type_out
,
329 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
330 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
333 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
334 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
335 unsigned int nvectors
= 1;
338 scalar_int_mode int_mode
;
339 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
340 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
342 /* Get the natural vector type for this SLP group size. */
343 tree int_type
= build_nonstandard_integer_type
344 (GET_MODE_BITSIZE (int_mode
), 1);
346 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
348 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
349 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
350 GET_MODE_SIZE (base_vector_mode
)))
352 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
353 together into elements of type INT_TYPE and using the result
354 to build NVECTORS vectors. */
355 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
356 vec_perm_builder
sel1 (nelts
, 2, 3);
357 vec_perm_builder
sel2 (nelts
, 2, 3);
358 poly_int64 half_nelts
= exact_div (nelts
, 2);
359 for (unsigned int i
= 0; i
< 3; ++i
)
362 sel1
.quick_push (i
+ nelts
);
363 sel2
.quick_push (half_nelts
+ i
);
364 sel2
.quick_push (half_nelts
+ i
+ nelts
);
366 vec_perm_indices
indices1 (sel1
, 2, nelts
);
367 vec_perm_indices
indices2 (sel2
, 2, nelts
);
368 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
369 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
372 *nvectors_out
= nvectors
;
374 *vector_type_out
= vector_type
;
377 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
379 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
386 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
392 /* Return true if DTA and DTB match. */
395 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
398 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
399 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
402 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
403 they are of a valid type and that they match the defs of the first stmt of
404 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
405 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
406 indicates swap is required for cond_expr stmts. Specifically, *SWAP
407 is 1 if STMT is cond and operands of comparison need to be swapped;
408 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
409 If there is any operand swap in this function, *SWAP is set to non-zero
411 If there was a fatal error return -1; if the error could be corrected by
412 swapping operands of father node of this one, return 1; if everything is
415 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
417 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
418 vec
<slp_oprnd_info
> *oprnds_info
)
420 stmt_vec_info stmt_info
= stmts
[stmt_num
];
422 unsigned int i
, number_of_oprnds
;
423 enum vect_def_type dt
= vect_uninitialized_def
;
424 slp_oprnd_info oprnd_info
;
425 int first_op_idx
= 1;
426 unsigned int commutative_op
= -1U;
427 bool first_op_cond
= false;
428 bool first
= stmt_num
== 0;
430 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
432 number_of_oprnds
= gimple_call_num_args (stmt
);
434 if (gimple_call_internal_p (stmt
))
436 internal_fn ifn
= gimple_call_internal_fn (stmt
);
437 commutative_op
= first_commutative_argument (ifn
);
439 /* Masked load, only look at mask. */
440 if (ifn
== IFN_MASK_LOAD
)
442 number_of_oprnds
= 1;
443 /* Mask operand index. */
448 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
450 enum tree_code code
= gimple_assign_rhs_code (stmt
);
451 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
452 /* Swap can only be done for cond_expr if asked to, otherwise we
453 could result in different comparison code to the first stmt. */
454 if (code
== COND_EXPR
455 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
457 first_op_cond
= true;
461 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
463 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
464 number_of_oprnds
= gimple_phi_num_args (stmt
);
468 bool swapped
= (swap
!= 0);
469 bool backedge
= false;
470 gcc_assert (!swapped
|| first_op_cond
);
471 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
472 for (i
= 0; i
< number_of_oprnds
; i
++)
476 /* Map indicating how operands of cond_expr should be swapped. */
477 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
478 int *map
= maps
[swap
];
481 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
482 first_op_idx
), map
[i
]);
484 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
486 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
488 oprnd
= gimple_phi_arg_def (stmt
, i
);
489 backedge
= dominated_by_p (CDI_DOMINATORS
,
490 gimple_phi_arg_edge (stmt
, i
)->src
,
491 gimple_bb (stmt_info
->stmt
));
494 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
495 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
496 oprnd
= TREE_OPERAND (oprnd
, 0);
498 oprnd_info
= (*oprnds_info
)[i
];
500 stmt_vec_info def_stmt_info
;
501 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
505 "Build SLP failed: can't analyze def for %T\n",
513 oprnd_info
->def_stmts
.quick_push (NULL
);
514 oprnd_info
->ops
.quick_push (NULL_TREE
);
515 oprnd_info
->first_dt
= vect_uninitialized_def
;
519 if (def_stmt_info
&& is_pattern_stmt_p (def_stmt_info
))
520 oprnd_info
->any_pattern
= true;
522 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
523 oprnd_info
->ops
.quick_push (oprnd
);
525 /* If there's a extern def on a backedge make sure we can
526 code-generate at the region start.
527 ??? This is another case that could be fixed by adjusting
528 how we split the function but at the moment we'd have conflicting
531 && dts
[i
] == vect_external_def
532 && is_a
<bb_vec_info
> (vinfo
)
533 && TREE_CODE (oprnd
) == SSA_NAME
534 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
535 && !dominated_by_p (CDI_DOMINATORS
,
536 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
537 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
539 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
541 "Build SLP failed: extern def %T only defined "
542 "on backedge\n", oprnd
);
548 tree type
= TREE_TYPE (oprnd
);
550 if ((dt
== vect_constant_def
551 || dt
== vect_external_def
)
552 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
553 && (TREE_CODE (type
) == BOOLEAN_TYPE
554 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
559 "Build SLP failed: invalid type of def "
560 "for variable-length SLP %T\n", oprnd
);
564 /* For the swapping logic below force vect_reduction_def
565 for the reduction op in a SLP reduction group. */
566 if (!STMT_VINFO_DATA_REF (stmt_info
)
567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
568 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
570 dts
[i
] = dt
= vect_reduction_def
;
572 /* Check the types of the definition. */
575 case vect_external_def
:
576 case vect_constant_def
:
577 case vect_internal_def
:
578 case vect_reduction_def
:
579 case vect_induction_def
:
580 case vect_nested_cycle
:
584 /* FORNOW: Not supported. */
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
587 "Build SLP failed: illegal type of def %T\n",
592 oprnd_info
->first_dt
= dt
;
593 oprnd_info
->first_op_type
= type
;
599 /* Now match the operand definition types to that of the first stmt. */
600 for (i
= 0; i
< number_of_oprnds
;)
608 oprnd_info
= (*oprnds_info
)[i
];
610 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
611 oprnd
= oprnd_info
->ops
[stmt_num
];
612 tree type
= TREE_TYPE (oprnd
);
614 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
616 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
618 "Build SLP failed: different operand types\n");
622 /* Not first stmt of the group, check that the def-stmt/s match
623 the def-stmt/s of the first stmt. Allow different definition
624 types for reduction chains: the first stmt must be a
625 vect_reduction_def (a phi node), and the rest
626 end in the reduction chain. */
627 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
628 && !(oprnd_info
->first_dt
== vect_reduction_def
629 && !STMT_VINFO_DATA_REF (stmt_info
)
630 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
632 && !STMT_VINFO_DATA_REF (def_stmt_info
)
633 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
634 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
635 || (!STMT_VINFO_DATA_REF (stmt_info
)
636 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
638 || STMT_VINFO_DATA_REF (def_stmt_info
)
639 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
640 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
641 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
643 /* Try swapping operands if we got a mismatch. For BB
644 vectorization only in case it will clearly improve things. */
645 if (i
== commutative_op
&& !swapped
646 && (!is_a
<bb_vec_info
> (vinfo
)
647 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
649 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
650 || vect_def_types_match
651 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE
, vect_location
,
655 "trying swapped operands\n");
656 std::swap (dts
[i
], dts
[i
+1]);
657 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
658 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
659 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
660 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
665 if (is_a
<bb_vec_info
> (vinfo
)
666 && !oprnd_info
->any_pattern
)
668 /* Now for commutative ops we should see whether we can
669 make the other operand matching. */
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
672 "treating operand as external\n");
673 oprnd_info
->first_dt
= dt
= vect_external_def
;
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
679 "Build SLP failed: different types\n");
684 /* Make sure to demote the overall operand to external. */
685 if (dt
== vect_external_def
)
686 oprnd_info
->first_dt
= vect_external_def
;
687 /* For a SLP reduction chain we want to duplicate the reduction to
688 each of the chain members. That gets us a sane SLP graph (still
689 the stmts are not 100% correct wrt the initial values). */
690 else if ((dt
== vect_internal_def
691 || dt
== vect_reduction_def
)
692 && oprnd_info
->first_dt
== vect_reduction_def
693 && !STMT_VINFO_DATA_REF (stmt_info
)
694 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
695 && !STMT_VINFO_DATA_REF (def_stmt_info
)
696 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
697 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
699 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
700 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
709 if (dump_enabled_p ())
710 dump_printf_loc (MSG_NOTE
, vect_location
,
711 "swapped operands to match def types in %G",
718 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
719 Return true if we can, meaning that this choice doesn't conflict with
720 existing SLP nodes that use STMT_INFO. */
723 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
725 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
727 return useless_type_conversion_p (vectype
, old_vectype
);
729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
731 /* We maintain the invariant that if any statement in the group is
732 used, all other members of the group have the same vector type. */
733 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
734 stmt_vec_info member_info
= first_info
;
735 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
736 if (is_pattern_stmt_p (member_info
)
737 && !useless_type_conversion_p (vectype
,
738 STMT_VINFO_VECTYPE (member_info
)))
743 for (member_info
= first_info
; member_info
;
744 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
745 STMT_VINFO_VECTYPE (member_info
) = vectype
;
749 else if (!is_pattern_stmt_p (stmt_info
))
751 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
755 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
758 "Build SLP failed: incompatible vector"
759 " types for: %G", stmt_info
->stmt
);
760 dump_printf_loc (MSG_NOTE
, vect_location
,
761 " old vector type: %T\n", old_vectype
);
762 dump_printf_loc (MSG_NOTE
, vect_location
,
763 " new vector type: %T\n", vectype
);
768 /* Return true if call statements CALL1 and CALL2 are similar enough
769 to be combined into the same SLP group. */
772 compatible_calls_p (gcall
*call1
, gcall
*call2
)
774 unsigned int nargs
= gimple_call_num_args (call1
);
775 if (nargs
!= gimple_call_num_args (call2
))
778 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
781 if (gimple_call_internal_p (call1
))
783 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
784 TREE_TYPE (gimple_call_lhs (call2
))))
786 for (unsigned int i
= 0; i
< nargs
; ++i
)
787 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
788 TREE_TYPE (gimple_call_arg (call2
, i
))))
793 if (!operand_equal_p (gimple_call_fn (call1
),
794 gimple_call_fn (call2
), 0))
797 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
803 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
804 caller's attempt to find the vector type in STMT_INFO with the narrowest
805 element type. Return true if VECTYPE is nonnull and if it is valid
806 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
807 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
808 vect_build_slp_tree. */
811 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
812 unsigned int group_size
,
813 tree vectype
, poly_uint64
*max_nunits
)
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
819 "Build SLP failed: unsupported data-type in %G\n",
821 /* Fatal mismatch. */
825 /* If populating the vector type requires unrolling then fail
826 before adjusting *max_nunits for basic-block vectorization. */
827 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
828 unsigned HOST_WIDE_INT const_nunits
;
829 if (is_a
<bb_vec_info
> (vinfo
)
830 && (!nunits
.is_constant (&const_nunits
)
831 || const_nunits
> group_size
))
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
835 "Build SLP failed: unrolling required "
836 "in basic block SLP\n");
837 /* Fatal mismatch. */
841 /* In case of multiple types we need to detect the smallest type. */
842 vect_update_max_nunits (max_nunits
, vectype
);
846 /* Verify if the scalar stmts STMTS are isomorphic, require data
847 permutation or are of unsupported types of operation. Return
848 true if they are, otherwise return false and indicate in *MATCHES
849 which stmts are not isomorphic to the first one. If MATCHES[0]
850 is false then this indicates the comparison could not be
851 carried out or the stmts will never be vectorized by SLP.
853 Note COND_EXPR is possibly isomorphic to another one after swapping its
854 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
855 the first stmt by swapping the two operands of comparison; set SWAP[i]
856 to 2 if stmt I is isormorphic to the first stmt by inverting the code
857 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
858 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
861 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
862 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
863 poly_uint64
*max_nunits
, bool *matches
,
864 bool *two_operators
, tree
*node_vectype
)
867 stmt_vec_info first_stmt_info
= stmts
[0];
868 enum tree_code first_stmt_code
= ERROR_MARK
;
869 enum tree_code alt_stmt_code
= ERROR_MARK
;
870 enum tree_code rhs_code
= ERROR_MARK
;
871 enum tree_code first_cond_code
= ERROR_MARK
;
873 bool need_same_oprnds
= false;
874 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
877 machine_mode optab_op2_mode
;
878 machine_mode vec_mode
;
879 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
880 bool first_stmt_load_p
= false, load_p
= false;
881 bool first_stmt_phi_p
= false, phi_p
= false;
883 /* For every stmt in NODE find its def stmt/s. */
884 stmt_vec_info stmt_info
;
885 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
887 gimple
*stmt
= stmt_info
->stmt
;
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
894 /* Fail to vectorize statements marked as unvectorizable, throw
896 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
897 || stmt_can_throw_internal (cfun
, stmt
)
898 || gimple_has_volatile_ops (stmt
))
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
902 "Build SLP failed: unvectorizable statement %G",
904 /* ??? For BB vectorization we want to commutate operands in a way
905 to shuffle all unvectorizable defs into one operand and have
906 the other still vectorized. The following doesn't reliably
907 work for this though but it's the easiest we can do here. */
908 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
910 /* Fatal mismatch. */
915 lhs
= gimple_get_lhs (stmt
);
916 if (lhs
== NULL_TREE
)
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
920 "Build SLP failed: not GIMPLE_ASSIGN nor "
921 "GIMPLE_CALL %G", stmt
);
922 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
924 /* Fatal mismatch. */
930 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
931 &nunits_vectype
, group_size
)
933 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
934 nunits_vectype
, max_nunits
)))
936 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
938 /* Fatal mismatch. */
943 gcc_assert (vectype
);
945 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
948 rhs_code
= CALL_EXPR
;
950 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
952 else if ((gimple_call_internal_p (call_stmt
)
953 && (!vectorizable_internal_fn_p
954 (gimple_call_internal_fn (call_stmt
))))
955 || gimple_call_tail_p (call_stmt
)
956 || gimple_call_noreturn_p (call_stmt
)
957 || !gimple_call_nothrow_p (call_stmt
)
958 || gimple_call_chain (call_stmt
))
960 if (dump_enabled_p ())
961 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
962 "Build SLP failed: unsupported call type %G",
964 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
966 /* Fatal mismatch. */
971 else if (gimple_code (stmt
) == GIMPLE_PHI
)
973 rhs_code
= ERROR_MARK
;
978 rhs_code
= gimple_assign_rhs_code (stmt
);
979 load_p
= gimple_vuse (stmt
);
982 /* Check the operation. */
985 *node_vectype
= vectype
;
986 first_stmt_code
= rhs_code
;
987 first_stmt_load_p
= load_p
;
988 first_stmt_phi_p
= phi_p
;
990 /* Shift arguments should be equal in all the packed stmts for a
991 vector shift with scalar shift operand. */
992 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
993 || rhs_code
== LROTATE_EXPR
994 || rhs_code
== RROTATE_EXPR
)
996 vec_mode
= TYPE_MODE (vectype
);
998 /* First see if we have a vector/vector shift. */
999 optab
= optab_for_tree_code (rhs_code
, vectype
,
1003 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1005 /* No vector/vector shift, try for a vector/scalar shift. */
1006 optab
= optab_for_tree_code (rhs_code
, vectype
,
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1013 "Build SLP failed: no optab.\n");
1014 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1016 /* Fatal mismatch. */
1020 icode
= (int) optab_handler (optab
, vec_mode
);
1021 if (icode
== CODE_FOR_nothing
)
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1025 "Build SLP failed: "
1026 "op not supported by target.\n");
1027 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1029 /* Fatal mismatch. */
1033 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1034 if (!VECTOR_MODE_P (optab_op2_mode
))
1036 need_same_oprnds
= true;
1037 first_op1
= gimple_assign_rhs2 (stmt
);
1041 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1043 need_same_oprnds
= true;
1044 first_op1
= gimple_assign_rhs2 (stmt
);
1047 && rhs_code
== BIT_FIELD_REF
)
1049 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1050 if (TREE_CODE (vec
) != SSA_NAME
1051 || !types_compatible_p (vectype
, TREE_TYPE (vec
)))
1053 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1055 if (dump_enabled_p ())
1056 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1057 "Build SLP failed: "
1058 "BIT_FIELD_REF not supported\n");
1059 /* Fatal mismatch. */
1065 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1067 need_same_oprnds
= true;
1068 first_op1
= gimple_call_arg (call_stmt
, 1);
1073 if (first_stmt_code
!= rhs_code
1074 && alt_stmt_code
== ERROR_MARK
)
1075 alt_stmt_code
= rhs_code
;
1076 if ((first_stmt_code
!= rhs_code
1077 && (first_stmt_code
!= IMAGPART_EXPR
1078 || rhs_code
!= REALPART_EXPR
)
1079 && (first_stmt_code
!= REALPART_EXPR
1080 || rhs_code
!= IMAGPART_EXPR
)
1081 /* Handle mismatches in plus/minus by computing both
1082 and merging the results. */
1083 && !((first_stmt_code
== PLUS_EXPR
1084 || first_stmt_code
== MINUS_EXPR
)
1085 && (alt_stmt_code
== PLUS_EXPR
1086 || alt_stmt_code
== MINUS_EXPR
)
1087 && rhs_code
== alt_stmt_code
)
1088 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1089 && (first_stmt_code
== ARRAY_REF
1090 || first_stmt_code
== BIT_FIELD_REF
1091 || first_stmt_code
== INDIRECT_REF
1092 || first_stmt_code
== COMPONENT_REF
1093 || first_stmt_code
== MEM_REF
)))
1094 || first_stmt_load_p
!= load_p
1095 || first_stmt_phi_p
!= phi_p
)
1097 if (dump_enabled_p ())
1099 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1100 "Build SLP failed: different operation "
1101 "in stmt %G", stmt
);
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1103 "original stmt %G", first_stmt_info
->stmt
);
1109 if (need_same_oprnds
)
1111 tree other_op1
= (call_stmt
1112 ? gimple_call_arg (call_stmt
, 1)
1113 : gimple_assign_rhs2 (stmt
));
1114 if (!operand_equal_p (first_op1
, other_op1
, 0))
1116 if (dump_enabled_p ())
1117 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1118 "Build SLP failed: different shift "
1119 "arguments in %G", stmt
);
1125 && first_stmt_code
== BIT_FIELD_REF
1126 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1127 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1131 "Build SLP failed: different BIT_FIELD_REF "
1132 "arguments in %G", stmt
);
1137 if (!load_p
&& rhs_code
== CALL_EXPR
)
1139 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1140 as_a
<gcall
*> (stmt
)))
1142 if (dump_enabled_p ())
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1144 "Build SLP failed: different calls in %G",
1152 && (gimple_bb (first_stmt_info
->stmt
)
1153 != gimple_bb (stmt_info
->stmt
)))
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1157 "Build SLP failed: different BB for PHI "
1163 if (!types_compatible_p (vectype
, *node_vectype
))
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1167 "Build SLP failed: different vector type "
1174 /* Grouped store or load. */
1175 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1177 if (REFERENCE_CLASS_P (lhs
))
1185 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1186 if (prev_first_load
)
1188 /* Check that there are no loads from different interleaving
1189 chains in the same node. */
1190 if (prev_first_load
!= first_load
)
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1195 "Build SLP failed: different "
1196 "interleaving chains in one node %G",
1203 prev_first_load
= first_load
;
1205 } /* Grouped access. */
1210 /* Not grouped load. */
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1213 "Build SLP failed: not grouped load %G", stmt
);
1215 /* FORNOW: Not grouped loads are not supported. */
1216 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1218 /* Fatal mismatch. */
1223 /* Not memory operation. */
1225 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1226 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1227 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1228 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1229 && rhs_code
!= VIEW_CONVERT_EXPR
1230 && rhs_code
!= CALL_EXPR
1231 && rhs_code
!= BIT_FIELD_REF
)
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1235 "Build SLP failed: operation unsupported %G",
1237 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1239 /* Fatal mismatch. */
1244 if (rhs_code
== COND_EXPR
)
1246 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1247 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1248 enum tree_code swap_code
= ERROR_MARK
;
1249 enum tree_code invert_code
= ERROR_MARK
;
1252 first_cond_code
= TREE_CODE (cond_expr
);
1253 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1255 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1256 swap_code
= swap_tree_comparison (cond_code
);
1257 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1260 if (first_cond_code
== cond_code
)
1262 /* Isomorphic can be achieved by swapping. */
1263 else if (first_cond_code
== swap_code
)
1265 /* Isomorphic can be achieved by inverting. */
1266 else if (first_cond_code
== invert_code
)
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1272 "Build SLP failed: different"
1273 " operation %G", stmt
);
1283 for (i
= 0; i
< group_size
; ++i
)
1287 /* If we allowed a two-operation SLP node verify the target can cope
1288 with the permute we are going to use. */
1289 if (alt_stmt_code
!= ERROR_MARK
1290 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1292 *two_operators
= true;
1298 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1299 Note we never remove apart from at destruction time so we do not
1300 need a special value for deleted that differs from empty. */
1303 typedef vec
<stmt_vec_info
> value_type
;
1304 typedef vec
<stmt_vec_info
> compare_type
;
1305 static inline hashval_t
hash (value_type
);
1306 static inline bool equal (value_type existing
, value_type candidate
);
1307 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1308 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1309 static const bool empty_zero_p
= true;
1310 static inline void mark_empty (value_type
&x
) { x
.release (); }
1311 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1312 static inline void remove (value_type
&x
) { x
.release (); }
1315 bst_traits::hash (value_type x
)
1318 for (unsigned i
= 0; i
< x
.length (); ++i
)
1319 h
.add_int (gimple_uid (x
[i
]->stmt
));
1323 bst_traits::equal (value_type existing
, value_type candidate
)
1325 if (existing
.length () != candidate
.length ())
1327 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1328 if (existing
[i
] != candidate
[i
])
1333 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1334 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1335 scalar_stmts_to_slp_tree_map_t
;
1338 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1339 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1340 poly_uint64
*max_nunits
,
1341 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1342 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1345 vect_build_slp_tree (vec_info
*vinfo
,
1346 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1347 poly_uint64
*max_nunits
,
1348 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1349 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1351 if (slp_tree
*leader
= bst_map
->get (stmts
))
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1355 *leader
? "" : "failed ", *leader
);
1358 SLP_TREE_REF_COUNT (*leader
)++;
1359 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1364 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1365 so we can pick up backedge destinations during discovery. */
1366 slp_tree res
= new _slp_tree
;
1367 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1368 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1369 bst_map
->put (stmts
.copy (), res
);
1371 poly_uint64 this_max_nunits
= 1;
1372 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1374 matches
, npermutes
, tree_size
, bst_map
);
1377 bool existed_p
= bst_map
->put (stmts
, NULL
);
1378 gcc_assert (existed_p
);
1379 /* Mark the node invalid so we can detect those when still in use
1380 as backedge destinations. */
1381 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1382 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1383 vect_free_slp_tree (res
);
1387 gcc_assert (res_
== res
);
1388 res
->max_nunits
= this_max_nunits
;
1389 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1390 /* Keep a reference for the bst_map use. */
1391 SLP_TREE_REF_COUNT (res
)++;
1396 /* Recursively build an SLP tree starting from NODE.
1397 Fail (and return a value not equal to zero) if def-stmts are not
1398 isomorphic, require data permutation or are of unsupported types of
1399 operation. Otherwise, return 0.
1400 The value returned is the depth in the SLP tree where a mismatch
1404 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1405 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1406 poly_uint64
*max_nunits
,
1407 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1408 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1410 unsigned nops
, i
, this_tree_size
= 0;
1411 poly_uint64 this_max_nunits
= *max_nunits
;
1415 stmt_vec_info stmt_info
= stmts
[0];
1416 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1417 nops
= gimple_call_num_args (stmt
);
1418 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1420 nops
= gimple_num_ops (stmt
) - 1;
1421 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1424 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1425 nops
= gimple_phi_num_args (phi
);
1429 /* If the SLP node is a PHI (induction or reduction), terminate
1431 bool *skip_args
= XALLOCAVEC (bool, nops
);
1432 memset (skip_args
, 0, sizeof (bool) * nops
);
1433 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1434 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1436 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1437 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1439 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1443 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1444 if (def_type
== vect_induction_def
)
1446 /* Induction PHIs are not cycles but walk the initial
1447 value. Only for inner loops through, for outer loops
1448 we need to pick up the value from the actual PHIs
1449 to more easily support peeling and epilogue vectorization. */
1450 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1451 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1452 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1455 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1457 else if (def_type
== vect_reduction_def
1458 || def_type
== vect_double_reduction_def
1459 || def_type
== vect_nested_cycle
)
1461 /* Else def types have to match. */
1462 stmt_vec_info other_info
;
1463 bool all_same
= true;
1464 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1466 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1468 if (other_info
!= stmt_info
)
1471 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1472 /* Reduction initial values are not explicitely represented. */
1473 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1474 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1475 /* Reduction chain backedge defs are filled manually.
1476 ??? Need a better way to identify a SLP reduction chain PHI.
1477 Or a better overall way to SLP match those. */
1478 if (all_same
&& def_type
== vect_reduction_def
)
1479 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1481 else if (def_type
!= vect_internal_def
)
1486 bool two_operators
= false;
1487 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1488 tree vectype
= NULL_TREE
;
1489 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1490 &this_max_nunits
, matches
, &two_operators
,
1494 /* If the SLP node is a load, terminate the recursion unless masked. */
1495 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1496 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1498 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1501 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1506 *max_nunits
= this_max_nunits
;
1508 node
= vect_create_new_slp_node (node
, stmts
, 0);
1509 SLP_TREE_VECTYPE (node
) = vectype
;
1510 /* And compute the load permutation. Whether it is actually
1511 a permutation depends on the unrolling factor which is
1513 vec
<unsigned> load_permutation
;
1515 stmt_vec_info load_info
;
1516 load_permutation
.create (group_size
);
1517 stmt_vec_info first_stmt_info
1518 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1519 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1521 int load_place
= vect_get_place_in_interleaving_chain
1522 (load_info
, first_stmt_info
);
1523 gcc_assert (load_place
!= -1);
1524 load_permutation
.safe_push (load_place
);
1526 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1530 else if (gimple_assign_single_p (stmt_info
->stmt
)
1531 && !gimple_vuse (stmt_info
->stmt
)
1532 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1534 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1535 the same SSA name vector of a compatible type to vectype. */
1536 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1537 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1538 stmt_vec_info estmt_info
;
1539 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1541 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1542 tree bfref
= gimple_assign_rhs1 (estmt
);
1544 if (!known_eq (bit_field_size (bfref
),
1545 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1546 || !constant_multiple_p (bit_field_offset (bfref
),
1547 bit_field_size (bfref
), &lane
))
1552 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1554 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1555 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1556 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1557 /* We are always building a permutation node even if it is an identity
1558 permute to shield the rest of the vectorizer from the odd node
1559 representing an actual vector without any scalar ops.
1560 ??? We could hide it completely with making the permute node
1562 node
= vect_create_new_slp_node (node
, stmts
, 1);
1563 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1564 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1565 SLP_TREE_VECTYPE (node
) = vectype
;
1566 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1570 /* Get at the operands, verifying they are compatible. */
1571 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1572 slp_oprnd_info oprnd_info
;
1573 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1575 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1576 stmts
, i
, &oprnds_info
);
1578 matches
[(res
== -1) ? 0 : i
] = false;
1582 for (i
= 0; i
< group_size
; ++i
)
1585 vect_free_oprnd_info (oprnds_info
);
1590 auto_vec
<slp_tree
, 4> children
;
1592 stmt_info
= stmts
[0];
1594 /* Create SLP_TREE nodes for the definition node/s. */
1595 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1600 /* We're skipping certain operands from processing, for example
1601 outer loop reduction initial defs. */
1604 children
.safe_push (NULL
);
1608 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1610 /* COND_EXPR have one too many eventually if the condition
1612 gcc_assert (i
== 3 && nops
== 4);
1616 if (is_a
<bb_vec_info
> (vinfo
)
1617 && oprnd_info
->first_dt
== vect_internal_def
1618 && !oprnd_info
->any_pattern
)
1620 /* For BB vectorization, if all defs are the same do not
1621 bother to continue the build along the single-lane
1622 graph but use a splat of the scalar value. */
1623 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1624 for (j
= 1; j
< group_size
; ++j
)
1625 if (oprnd_info
->def_stmts
[j
] != first_def
)
1628 /* But avoid doing this for loads where we may be
1629 able to CSE things, unless the stmt is not
1631 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1632 || !gimple_vuse (first_def
->stmt
)))
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_NOTE
, vect_location
,
1636 "Using a splat of the uniform operand\n");
1637 oprnd_info
->first_dt
= vect_external_def
;
1641 if (oprnd_info
->first_dt
== vect_external_def
1642 || oprnd_info
->first_dt
== vect_constant_def
)
1644 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1645 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1646 oprnd_info
->ops
= vNULL
;
1647 children
.safe_push (invnode
);
1651 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1652 group_size
, &this_max_nunits
,
1654 &this_tree_size
, bst_map
)) != NULL
)
1656 oprnd_info
->def_stmts
= vNULL
;
1657 children
.safe_push (child
);
1661 /* If the SLP build for operand zero failed and operand zero
1662 and one can be commutated try that for the scalar stmts
1663 that failed the match. */
1665 /* A first scalar stmt mismatch signals a fatal mismatch. */
1667 /* ??? For COND_EXPRs we can swap the comparison operands
1668 as well as the arms under some constraints. */
1670 && oprnds_info
[1]->first_dt
== vect_internal_def
1671 && is_gimple_assign (stmt_info
->stmt
)
1672 /* Swapping operands for reductions breaks assumptions later on. */
1673 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1674 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
1675 /* Do so only if the number of not successful permutes was nor more
1676 than a cut-ff as re-trying the recursive match on
1677 possibly each level of the tree would expose exponential
1681 /* See whether we can swap the matching or the non-matching
1683 bool swap_not_matching
= true;
1686 for (j
= 0; j
< group_size
; ++j
)
1688 if (matches
[j
] != !swap_not_matching
)
1690 stmt_vec_info stmt_info
= stmts
[j
];
1691 /* Verify if we can swap operands of this stmt. */
1692 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1694 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1696 if (!swap_not_matching
)
1698 swap_not_matching
= false;
1703 while (j
!= group_size
);
1705 /* Swap mismatched definition stmts. */
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE
, vect_location
,
1708 "Re-trying with swapped operands of stmts ");
1709 for (j
= 0; j
< group_size
; ++j
)
1710 if (matches
[j
] == !swap_not_matching
)
1712 std::swap (oprnds_info
[0]->def_stmts
[j
],
1713 oprnds_info
[1]->def_stmts
[j
]);
1714 std::swap (oprnds_info
[0]->ops
[j
],
1715 oprnds_info
[1]->ops
[j
]);
1716 if (dump_enabled_p ())
1717 dump_printf (MSG_NOTE
, "%d ", j
);
1719 if (dump_enabled_p ())
1720 dump_printf (MSG_NOTE
, "\n");
1721 /* And try again with scratch 'matches' ... */
1722 bool *tem
= XALLOCAVEC (bool, group_size
);
1723 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1724 group_size
, &this_max_nunits
,
1726 &this_tree_size
, bst_map
)) != NULL
)
1728 oprnd_info
->def_stmts
= vNULL
;
1729 children
.safe_push (child
);
1732 /* We do not undo the swapping here since it might still be
1733 the better order for the second operand in case we build
1734 the first one from scalars below. */
1739 /* If the SLP build failed and we analyze a basic-block
1740 simply treat nodes we fail to build as externally defined
1741 (and thus build vectors from the scalar defs).
1742 The cost model will reject outright expensive cases.
1743 ??? This doesn't treat cases where permutation ultimatively
1744 fails (or we don't try permutation below). Ideally we'd
1745 even compute a permutation that will end up with the maximum
1747 if (is_a
<bb_vec_info
> (vinfo
)
1748 /* ??? Rejecting patterns this way doesn't work. We'd have to
1749 do extra work to cancel the pattern so the uses see the
1751 && !is_pattern_stmt_p (stmt_info
)
1752 && !oprnd_info
->any_pattern
)
1754 /* But if there's a leading vector sized set of matching stmts
1755 fail here so we can split the group. This matches the condition
1756 vect_analyze_slp_instance uses. */
1757 /* ??? We might want to split here and combine the results to support
1758 multiple vector sizes better. */
1759 for (j
= 0; j
< group_size
; ++j
)
1762 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE
, vect_location
,
1766 "Building vector operands from scalars\n");
1768 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1769 children
.safe_push (child
);
1770 oprnd_info
->ops
= vNULL
;
1775 gcc_assert (child
== NULL
);
1776 FOR_EACH_VEC_ELT (children
, j
, child
)
1778 vect_free_slp_tree (child
);
1779 vect_free_oprnd_info (oprnds_info
);
1783 vect_free_oprnd_info (oprnds_info
);
1785 /* If we have all children of a child built up from uniform scalars
1786 or does more than one possibly expensive vector construction then
1787 just throw that away, causing it built up from scalars.
1788 The exception is the SLP node for the vector store. */
1789 if (is_a
<bb_vec_info
> (vinfo
)
1790 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1791 /* ??? Rejecting patterns this way doesn't work. We'd have to
1792 do extra work to cancel the pattern so the uses see the
1794 && !is_pattern_stmt_p (stmt_info
))
1798 bool all_uniform_p
= true;
1799 unsigned n_vector_builds
= 0;
1800 FOR_EACH_VEC_ELT (children
, j
, child
)
1804 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1805 all_uniform_p
= false;
1806 else if (!vect_slp_tree_uniform_p (child
))
1808 all_uniform_p
= false;
1809 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1813 if (all_uniform_p
|| n_vector_builds
> 1)
1817 FOR_EACH_VEC_ELT (children
, j
, child
)
1819 vect_free_slp_tree (child
);
1821 if (dump_enabled_p ())
1822 dump_printf_loc (MSG_NOTE
, vect_location
,
1823 "Building parent vector operands from "
1824 "scalars instead\n");
1829 *tree_size
+= this_tree_size
+ 1;
1830 *max_nunits
= this_max_nunits
;
1834 /* ??? We'd likely want to either cache in bst_map sth like
1835 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1836 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1837 explicit stmts to put in so the keying on 'stmts' doesn't
1838 work (but we have the same issue with nodes that use 'ops'). */
1839 slp_tree one
= new _slp_tree
;
1840 slp_tree two
= new _slp_tree
;
1841 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1842 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1843 SLP_TREE_VECTYPE (one
) = vectype
;
1844 SLP_TREE_VECTYPE (two
) = vectype
;
1845 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1846 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1849 SLP_TREE_REF_COUNT (child
)++;
1851 /* Here we record the original defs since this
1852 node represents the final lane configuration. */
1853 node
= vect_create_new_slp_node (node
, stmts
, 2);
1854 SLP_TREE_VECTYPE (node
) = vectype
;
1855 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1856 SLP_TREE_CHILDREN (node
).quick_push (one
);
1857 SLP_TREE_CHILDREN (node
).quick_push (two
);
1858 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1859 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1860 enum tree_code ocode
= ERROR_MARK
;
1861 stmt_vec_info ostmt_info
;
1863 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1865 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1866 if (gimple_assign_rhs_code (ostmt
) != code0
)
1868 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1869 ocode
= gimple_assign_rhs_code (ostmt
);
1873 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1875 SLP_TREE_CODE (one
) = code0
;
1876 SLP_TREE_CODE (two
) = ocode
;
1877 SLP_TREE_LANES (one
) = stmts
.length ();
1878 SLP_TREE_LANES (two
) = stmts
.length ();
1879 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1880 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1884 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1885 SLP_TREE_VECTYPE (node
) = vectype
;
1886 SLP_TREE_CHILDREN (node
).splice (children
);
1890 /* Dump a single SLP tree NODE. */
1893 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1898 stmt_vec_info stmt_info
;
1901 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1902 dump_user_location_t user_loc
= loc
.get_user_location ();
1903 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1904 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1906 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1909 estimated_poly_value (node
->max_nunits
),
1910 SLP_TREE_REF_COUNT (node
));
1911 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
1913 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
1914 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
1916 dump_printf_loc (metadata
, user_loc
, "op template: %G",
1917 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
1919 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1920 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1921 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1924 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1925 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1926 dump_printf (metadata
, "%T%s ", op
,
1927 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1928 dump_printf (metadata
, "}\n");
1930 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1932 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1933 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1934 dump_printf (dump_kind
, " %u", j
);
1935 dump_printf (dump_kind
, " }\n");
1937 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1939 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1940 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1941 dump_printf (dump_kind
, " %u[%u]",
1942 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1943 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
1944 dump_printf (dump_kind
, " }\n");
1946 if (SLP_TREE_CHILDREN (node
).is_empty ())
1948 dump_printf_loc (metadata
, user_loc
, "\tchildren");
1949 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1950 dump_printf (dump_kind
, " %p", (void *)child
);
1951 dump_printf (dump_kind
, "\n");
1955 debug (slp_tree node
)
1957 debug_dump_context ctx
;
1958 vect_print_slp_tree (MSG_NOTE
,
1959 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
1963 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1966 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1967 slp_tree node
, hash_set
<slp_tree
> &visited
)
1972 if (visited
.add (node
))
1975 vect_print_slp_tree (dump_kind
, loc
, node
);
1977 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1979 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
1983 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1986 hash_set
<slp_tree
> visited
;
1987 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
1990 /* Mark the tree rooted at NODE with PURE_SLP. */
1993 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
1996 stmt_vec_info stmt_info
;
1999 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2002 if (visited
.add (node
))
2005 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2006 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2008 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2010 vect_mark_slp_stmts (child
, visited
);
2014 vect_mark_slp_stmts (slp_tree node
)
2016 hash_set
<slp_tree
> visited
;
2017 vect_mark_slp_stmts (node
, visited
);
2020 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2023 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2026 stmt_vec_info stmt_info
;
2029 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2032 if (visited
.add (node
))
2035 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2037 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2038 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2039 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2042 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2044 vect_mark_slp_stmts_relevant (child
, visited
);
2048 vect_mark_slp_stmts_relevant (slp_tree node
)
2050 hash_set
<slp_tree
> visited
;
2051 vect_mark_slp_stmts_relevant (node
, visited
);
2055 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2058 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2059 hash_set
<slp_tree
> &visited
)
2061 if (!node
|| visited
.add (node
))
2064 if (SLP_TREE_CHILDREN (node
).length () == 0)
2066 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2068 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2069 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2070 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2071 loads
.safe_push (node
);
2077 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2078 vect_gather_slp_loads (loads
, child
, visited
);
2083 /* Find the last store in SLP INSTANCE. */
2086 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2088 stmt_vec_info last
= NULL
;
2089 stmt_vec_info stmt_vinfo
;
2091 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2093 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2094 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2100 /* Find the first stmt in NODE. */
2103 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2105 stmt_vec_info first
= NULL
;
2106 stmt_vec_info stmt_vinfo
;
2108 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2110 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2112 || get_later_stmt (stmt_vinfo
, first
) == first
)
2119 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2120 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2121 (also containing the first GROUP1_SIZE stmts, since stores are
2122 consecutive), the second containing the remainder.
2123 Return the first stmt in the second group. */
2125 static stmt_vec_info
2126 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2128 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2129 gcc_assert (group1_size
> 0);
2130 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2131 gcc_assert (group2_size
> 0);
2132 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2134 stmt_vec_info stmt_info
= first_vinfo
;
2135 for (unsigned i
= group1_size
; i
> 1; i
--)
2137 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2138 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2140 /* STMT is now the last element of the first group. */
2141 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2142 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2144 DR_GROUP_SIZE (group2
) = group2_size
;
2145 for (stmt_info
= group2
; stmt_info
;
2146 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2148 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2149 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2152 /* For the second group, the DR_GROUP_GAP is that before the original group,
2153 plus skipping over the first vector. */
2154 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2156 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2157 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2159 if (dump_enabled_p ())
2160 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2161 group1_size
, group2_size
);
2166 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2167 statements and a vector of NUNITS elements. */
2170 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2172 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2176 vect_analyze_slp_instance (vec_info
*vinfo
,
2177 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2178 stmt_vec_info stmt_info
, slp_instance_kind kind
,
2179 unsigned max_tree_size
);
2181 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2182 of KIND. Return true if successful. */
2185 vect_build_slp_instance (vec_info
*vinfo
,
2186 slp_instance_kind kind
,
2187 vec
<stmt_vec_info
> scalar_stmts
,
2188 stmt_vec_info root_stmt_info
,
2189 unsigned max_tree_size
,
2190 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2191 /* ??? We need stmt_info for group splitting. */
2192 stmt_vec_info stmt_info_
)
2194 if (dump_enabled_p ())
2196 dump_printf_loc (MSG_NOTE
, vect_location
,
2197 "Starting SLP discovery for\n");
2198 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2199 dump_printf_loc (MSG_NOTE
, vect_location
,
2200 " %G", scalar_stmts
[i
]->stmt
);
2203 /* Build the tree for the SLP instance. */
2204 unsigned int group_size
= scalar_stmts
.length ();
2205 bool *matches
= XALLOCAVEC (bool, group_size
);
2206 unsigned npermutes
= 0;
2207 poly_uint64 max_nunits
= 1;
2208 unsigned tree_size
= 0;
2210 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2211 &max_nunits
, matches
, &npermutes
,
2212 &tree_size
, bst_map
);
2215 /* Calculate the unrolling factor based on the smallest type. */
2216 poly_uint64 unrolling_factor
2217 = calculate_unrolling_factor (max_nunits
, group_size
);
2219 if (maybe_ne (unrolling_factor
, 1U)
2220 && is_a
<bb_vec_info
> (vinfo
))
2222 unsigned HOST_WIDE_INT const_max_nunits
;
2223 if (!max_nunits
.is_constant (&const_max_nunits
)
2224 || const_max_nunits
> group_size
)
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2228 "Build SLP failed: store group "
2229 "size not a multiple of the vector size "
2230 "in basic block SLP\n");
2231 vect_free_slp_tree (node
);
2234 /* Fatal mismatch. */
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_NOTE
, vect_location
,
2237 "SLP discovery succeeded but node needs "
2239 memset (matches
, true, group_size
);
2240 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2241 vect_free_slp_tree (node
);
2245 /* Create a new SLP instance. */
2246 slp_instance new_instance
= XNEW (class _slp_instance
);
2247 SLP_INSTANCE_TREE (new_instance
) = node
;
2248 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2249 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2250 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2251 SLP_INSTANCE_KIND (new_instance
) = kind
;
2252 new_instance
->reduc_phis
= NULL
;
2253 new_instance
->cost_vec
= vNULL
;
2254 new_instance
->subgraph_entries
= vNULL
;
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_NOTE
, vect_location
,
2258 "SLP size %u vs. limit %u.\n",
2259 tree_size
, max_tree_size
);
2261 /* Fixup SLP reduction chains. */
2262 if (kind
== slp_inst_kind_reduc_chain
)
2264 /* If this is a reduction chain with a conversion in front
2265 amend the SLP tree with a node for that. */
2267 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2268 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2270 /* Get at the conversion stmt - we know it's the single use
2271 of the last stmt of the reduction chain. */
2272 use_operand_p use_p
;
2273 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2274 &use_p
, &scalar_def
);
2276 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2277 next_info
= vect_stmt_to_vectorize (next_info
);
2278 scalar_stmts
= vNULL
;
2279 scalar_stmts
.create (group_size
);
2280 for (unsigned i
= 0; i
< group_size
; ++i
)
2281 scalar_stmts
.quick_push (next_info
);
2282 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2283 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2284 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2285 SLP_INSTANCE_TREE (new_instance
) = conv
;
2286 /* We also have to fake this conversion stmt as SLP reduction
2287 group so we don't have to mess with too much code
2289 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2290 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2292 /* Fill the backedge child of the PHI SLP node. The
2293 general matching code cannot find it because the
2294 scalar code does not reflect how we vectorize the
2296 use_operand_p use_p
;
2297 imm_use_iterator imm_iter
;
2298 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2299 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2300 gimple_get_lhs (scalar_def
))
2301 /* There are exactly two non-debug uses, the reduction
2302 PHI and the loop-closed PHI node. */
2303 if (!is_gimple_debug (USE_STMT (use_p
))
2304 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2306 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2307 stmt_vec_info phi_info
2308 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2309 for (unsigned i
= 0; i
< group_size
; ++i
)
2310 phis
.quick_push (phi_info
);
2311 slp_tree
*phi_node
= bst_map
->get (phis
);
2312 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2313 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2314 = SLP_INSTANCE_TREE (new_instance
);
2315 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2319 vinfo
->slp_instances
.safe_push (new_instance
);
2321 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2322 the number of scalar stmts in the root in a few places.
2323 Verify that assumption holds. */
2324 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2325 .length () == group_size
);
2327 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_NOTE
, vect_location
,
2330 "Final SLP tree for instance %p:\n", new_instance
);
2331 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2332 SLP_INSTANCE_TREE (new_instance
));
2340 /* Failed to SLP. */
2341 /* Free the allocated memory. */
2342 scalar_stmts
.release ();
2345 stmt_vec_info stmt_info
= stmt_info_
;
2346 /* Try to break the group up into pieces. */
2347 if (kind
== slp_inst_kind_store
)
2349 /* ??? We could delay all the actual splitting of store-groups
2350 until after SLP discovery of the original group completed.
2351 Then we can recurse to vect_build_slp_instance directly. */
2352 for (i
= 0; i
< group_size
; i
++)
2356 /* For basic block SLP, try to break the group up into multiples of
2358 if (is_a
<bb_vec_info
> (vinfo
)
2359 && (i
> 1 && i
< group_size
))
2362 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2363 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2364 1 << floor_log2 (i
));
2365 unsigned HOST_WIDE_INT const_nunits
;
2367 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2369 /* Split into two groups at the first vector boundary. */
2370 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2371 unsigned group1_size
= i
& ~(const_nunits
- 1);
2373 if (dump_enabled_p ())
2374 dump_printf_loc (MSG_NOTE
, vect_location
,
2375 "Splitting SLP group at stmt %u\n", i
);
2376 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2378 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2379 kind
, max_tree_size
);
2380 /* Split the rest at the failure point and possibly
2381 re-analyze the remaining matching part if it has
2382 at least two lanes. */
2384 && (i
+ 1 < group_size
2385 || i
- group1_size
> 1))
2387 stmt_vec_info rest2
= rest
;
2388 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2389 if (i
- group1_size
> 1)
2390 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
2391 kind
, max_tree_size
);
2393 /* Re-analyze the non-matching tail if it has at least
2395 if (i
+ 1 < group_size
)
2396 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2397 rest
, kind
, max_tree_size
);
2402 /* For loop vectorization split into arbitrary pieces of size > 1. */
2403 if (is_a
<loop_vec_info
> (vinfo
)
2404 && (i
> 1 && i
< group_size
))
2406 unsigned group1_size
= i
;
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_NOTE
, vect_location
,
2410 "Splitting SLP group at stmt %u\n", i
);
2412 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2414 /* Loop vectorization cannot handle gaps in stores, make sure
2415 the split group appears as strided. */
2416 STMT_VINFO_STRIDED_P (rest
) = 1;
2417 DR_GROUP_GAP (rest
) = 0;
2418 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2419 DR_GROUP_GAP (stmt_info
) = 0;
2421 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2422 kind
, max_tree_size
);
2423 if (i
+ 1 < group_size
)
2424 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2425 rest
, kind
, max_tree_size
);
2430 /* Even though the first vector did not all match, we might be able to SLP
2431 (some) of the remainder. FORNOW ignore this possibility. */
2434 /* Failed to SLP. */
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2441 /* Analyze an SLP instance starting from a group of grouped stores. Call
2442 vect_build_slp_tree to build a tree of packed stmts if possible.
2443 Return FALSE if it's impossible to SLP any stmt in the loop. */
2446 vect_analyze_slp_instance (vec_info
*vinfo
,
2447 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2448 stmt_vec_info stmt_info
,
2449 slp_instance_kind kind
,
2450 unsigned max_tree_size
)
/* SCALAR_STMTS collects the seed statements for the instance; how it is
   filled depends on KIND below.  */
2453 vec
<stmt_vec_info
> scalar_stmts
;
/* For BB vectorization point diagnostics at the region statement.  */
2455 if (is_a
<bb_vec_info
> (vinfo
))
2456 vect_location
= stmt_info
->stmt
;
2458 stmt_vec_info next_info
= stmt_info
;
2459 if (kind
== slp_inst_kind_store
)
2461 /* Collect the stores and store them in scalar_stmts. */
2462 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
2465 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2466 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2469 else if (kind
== slp_inst_kind_reduc_chain
)
2471 /* Collect the reduction stmts and store them in scalar_stmts. */
2472 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
2475 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2476 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2478 /* Mark the first element of the reduction chain as reduction to properly
2479 transform the node. In the reduction analysis phase only the last
2480 element of the chain is marked as reduction. */
2481 STMT_VINFO_DEF_TYPE (stmt_info
)
2482 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2483 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2484 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2486 else if (kind
== slp_inst_kind_ctor
)
/* Seed from the elements of a vector CONSTRUCTOR on the stmt's RHS.  */
2488 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2490 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
2491 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2493 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
2494 def_info
= vect_stmt_to_vectorize (def_info
);
2495 scalar_stmts
.quick_push (def_info
);
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE
, vect_location
,
2499 "Analyzing vectorizable constructor: %G\n",
2502 else if (kind
== slp_inst_kind_reduc_group
)
2504 /* Collect reduction statements. */
2505 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2506 scalar_stmts
.create (reductions
.length ());
2507 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2508 if (STMT_VINFO_RELEVANT_P (next_info
)
2509 || STMT_VINFO_LIVE_P (next_info
))
2510 scalar_stmts
.quick_push (next_info
);
2511 /* If less than two were relevant/live there's nothing to SLP. */
2512 if (scalar_stmts
.length () < 2)
2518 /* Build the tree for the SLP instance. */
2519 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2520 kind
== slp_inst_kind_ctor
2522 max_tree_size
, bst_map
,
2523 kind
== slp_inst_kind_store
2524 ? stmt_info
: NULL
);
2526 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2527 where we should do store group splitting. */
2532 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2533 trees of packed scalar stmts if SLP is possible. */
2536 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2539 stmt_vec_info first_element
;
2541 DUMP_VECT_SCOPE ("vect_analyze_slp");
/* BST_MAP caches discovered SLP subtrees keyed on their scalar stmts so
   shared subgraphs are built only once.  */
2543 scalar_stmts_to_slp_tree_map_t
*bst_map
2544 = new scalar_stmts_to_slp_tree_map_t ();
2546 /* Find SLP sequences starting from groups of grouped stores. */
2547 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2548 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2549 STMT_VINFO_GROUPED_ACCESS (first_element
)
2550 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
2553 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2555 /* Find SLP sequences starting from reduction chains. */
2556 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2557 if (! STMT_VINFO_RELEVANT_P (first_element
)
2558 && ! STMT_VINFO_LIVE_P (first_element
))
2560 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2561 slp_inst_kind_reduc_chain
,
2564 /* Dissolve reduction chain group. */
2565 stmt_vec_info vinfo
= first_element
;
2566 stmt_vec_info last
= NULL
;
2569 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2570 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2571 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2575 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2576 /* It can be still vectorized as part of an SLP reduction. */
2577 loop_vinfo
->reductions
.safe_push (last
);
2580 /* Find SLP sequences starting from groups of reductions. */
2581 if (loop_vinfo
->reductions
.length () > 1)
2582 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2583 slp_inst_kind_reduc_group
, max_tree_size
);
2586 /* The map keeps a reference on SLP nodes built, release that. */
2587 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2588 it
!= bst_map
->end (); ++it
)
2590 vect_free_slp_tree ((*it
).second
);
2593 return opt_result::success ();
2596 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2599 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2600 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
/* Only number and record each node once; VERTEX gets the node's index
   in the VERTICES vector (discovery order).  */
2605 if (visited
.add (node
))
2608 node
->vertex
= vertices
.length ();
2609 vertices
.safe_push (node
)
;
/* Recurse into all children; nodes without children end up in LEAFS.  */
2612 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2616 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2619 leafs
.safe_push (node
->vertex
);
2622 /* Fill the vertices and leafs vector with all nodes in the SLP graph,
   walking the tree of every SLP instance recorded in INFO.  */
2625 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2628 hash_set
<slp_tree
> visited
;
2630 slp_instance instance
;
2631 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2633 unsigned n_v
= vertices
.length ();
2634 unsigned n_l
= leafs
.length ();
2635 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2637 /* If we added vertices but no entries to the reverse graph we've
2638 added a cycle that is not backwards-reachable. Push the entry
2639 to mimic as leaf then. */
2640 if (vertices
.length () > n_v
2641 && leafs
.length () == n_l
)
2642 leafs
.safe_push (SLP_INSTANCE_TREE (instance
)->vertex
);
2646 /* Apply (reverse) bijective PERM to VEC, saving the original contents
   first; the trailing asserts check PERM really is a bijection.  */
2650 vect_slp_permute (vec
<unsigned> perm
,
2651 vec
<T
> &vec
, bool reverse
)
2653 auto_vec
<T
, 64> saved
;
2654 saved
.create (vec
.length ());
2655 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2656 saved
.quick_push (vec
[i
]);
/* Reverse application: element i moves to position perm[i].  */
2660 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2661 vec
[perm
[i
]] = saved
[i
];
2662 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2663 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
/* Forward application: position i receives element perm[i].  */
2667 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2668 vec
[i
] = saved
[perm
[i
]];
2669 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2670 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
2674 /* Return whether permutations PERM_A and PERM_B as recorded in the
2675 PERMS vector are equal.  Equal indices trivially compare equal;
   otherwise the two index vectors must match element-wise.  */
2678 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
2679 int perm_a
, int perm_b
)
2681 return (perm_a
== perm_b
2682 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
2683 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
2684 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
2687 /* Optimize the SLP graph of VINFO.  Builds a vertex/edge view of all SLP
   nodes, seeds permutations from load permutations, propagates them over
   the graph, then materializes or elides them.  */
2690 vect_optimize_slp (vec_info
*vinfo
)
2692 if (vinfo
->slp_instances
.is_empty ())
2697 auto_vec
<slp_tree
> vertices
;
2698 auto_vec
<int> leafs
;
2699 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
/* Create a graph with one vertex per SLP node and an edge to each child.  */
2701 struct graph
*slpg
= new_graph (vertices
.length ());
2702 FOR_EACH_VEC_ELT (vertices
, i
, node
)
2706 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2708 add_edge (slpg
, i
, child
->vertex
);
2711 /* Compute (reverse) postorder on the inverted graph. */
2713 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
2715 auto_sbitmap
n_visited (vertices
.length ());
2716 auto_sbitmap
n_materialize (vertices
.length ());
2717 auto_vec
<int> n_perm (vertices
.length ());
2718 auto_vec
<vec
<unsigned> > perms
;
2720 bitmap_clear (n_visited
);
2721 bitmap_clear (n_materialize
);
2722 n_perm
.quick_grow_cleared (vertices
.length ());
2723 perms
.safe_push (vNULL
); /* zero is no permute */
2725 /* Produce initial permutations. */
2726 for (i
= 0; i
< leafs
.length (); ++i
)
2729 slp_tree node
= vertices
[idx
];
2731 /* Handle externals and constants optimistically throughout the
2733 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
2734 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
2737 /* Leafs do not change across iterations. Note leafs also double
2738 as entries to the reverse graph. */
2739 if (!slpg
->vertices
[idx
].succ
)
2740 bitmap_set_bit (n_visited
, idx
);
2741 /* Loads are the only thing generating permutes. */
2742 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2745 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2746 node unpermuted, record this permute. */
2747 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
2748 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
2750 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
2751 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
2752 bool any_permute
= false;
2753 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2755 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
2756 imin
= MIN (imin
, idx
);
2757 imax
= MAX (imax
, idx
);
2758 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
2761 /* If there's no permute no need to split one out. */
2764 /* If the span doesn't match we'd disrupt VF computation, avoid
2766 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
2769 /* For now only handle true permutes, like
2770 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2771 when permuting constants and invariants keeping the permute
2773 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
2774 bitmap_clear (load_index
);
2775 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2776 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
2778 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2779 if (!bitmap_bit_p (load_index
, j
))
2781 if (j
!= SLP_TREE_LANES (node
))
2784 vec
<unsigned> perm
= vNULL
;
2785 perm
.safe_grow (SLP_TREE_LANES (node
), true);
2786 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2787 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
2788 perms
.safe_push (perm
);
2789 n_perm
[idx
] = perms
.length () - 1;
2792 /* Propagate permutes along the graph and compute materialization points. */
2794 unsigned iteration
= 0;
2800 for (i
= vertices
.length (); i
> 0 ; --i
)
2803 slp_tree node
= vertices
[idx
];
2804 /* For leafs there's nothing to do - we've seeded permutes
2806 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2809 bitmap_set_bit (n_visited
, idx
);
2811 /* We cannot move a permute across a store. */
2812 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
2814 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
2818 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
2819 succ
; succ
= succ
->succ_next
)
2821 int succ_idx
= succ
->dest
;
2822 /* Handle unvisited nodes optimistically. */
2823 /* ??? But for constants once we want to handle non-bijective
2824 permutes we have to verify the permute, when unifying lanes,
2825 will not unify different constants. For example see
2826 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2827 if (!bitmap_bit_p (n_visited
, succ_idx
))
2829 int succ_perm
= n_perm
[succ_idx
];
2830 /* Once we materialize succs permutation its output lanes
2831 appear unpermuted to us. */
2832 if (bitmap_bit_p (n_materialize
, succ_idx
))
2836 else if (succ_perm
== 0)
2841 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
2849 /* Pick up pre-computed leaf values. */
2851 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
2854 /* Make sure we eventually converge. */
2855 gcc_checking_assert (perm
== 0);
2858 bitmap_clear_bit (n_materialize
, idx
);
2865 /* Elide pruning at materialization points in the first
2866 iteration so every node was visited once at least. */
2870 /* Decide on permute materialization. Look whether there's
2871 a use (pred) edge that is permuted differently than us.
2872 In that case mark ourselves so the permutation is applied. */
2873 bool all_preds_permuted
= slpg
->vertices
[idx
].pred
!= NULL
;
2874 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
2875 pred
; pred
= pred
->pred_next
)
2877 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
2878 int pred_perm
= n_perm
[pred
->src
];
2879 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
2881 all_preds_permuted
= false;
2885 if (!all_preds_permuted
)
2887 if (!bitmap_bit_p (n_materialize
, idx
))
2889 bitmap_set_bit (n_materialize
, idx
);
2893 while (changed
|| iteration
== 1);
/* Materialize the computed permutes in the SLP graph.  */
2896 for (i
= 0; i
< vertices
.length (); ++i
)
2898 int perm
= n_perm
[i
];
2902 slp_tree node
= vertices
[i
];
2904 /* First permute invariant/external original successors. */
2907 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2909 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2912 /* If the vector is uniform there's nothing to do. */
2913 if (vect_slp_tree_uniform_p (child
))
2916 /* We can end up sharing some externals via two_operator
2917 handling. Be prepared to unshare those. */
2918 if (child
->refcnt
!= 1)
2920 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
2921 SLP_TREE_CHILDREN (node
)[j
] = child
2922 = vect_create_new_slp_node
2923 (SLP_TREE_SCALAR_OPS (child
).copy ());
2925 vect_slp_permute (perms
[perm
],
2926 SLP_TREE_SCALAR_OPS (child
), true);
2929 if (bitmap_bit_p (n_materialize
, i
))
2931 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2932 /* For loads simply drop the permutation, the load permutation
2933 already performs the desired permutation. */
2935 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
2937 /* If the node if already a permute node we just need to apply
2938 the permutation to the permute node itself. */
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_NOTE
, vect_location
,
2941 "simplifying permute node %p\n",
2944 vect_slp_permute (perms
[perm
], SLP_TREE_LANE_PERMUTATION (node
),
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE
, vect_location
,
2951 "inserting permute node in place of %p\n",
2954 /* Make a copy of NODE and in-place change it to a
2955 VEC_PERM node to permute the lanes of the copy. */
2956 slp_tree copy
= new _slp_tree
;
2957 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
2958 SLP_TREE_CHILDREN (node
) = vNULL
;
2959 SLP_TREE_SCALAR_STMTS (copy
)
2960 = SLP_TREE_SCALAR_STMTS (node
).copy ();
2961 vect_slp_permute (perms
[perm
],
2962 SLP_TREE_SCALAR_STMTS (copy
), true);
2963 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
2964 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
2965 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
2966 SLP_TREE_LANE_PERMUTATION (copy
)
2967 = SLP_TREE_LANE_PERMUTATION (node
);
2968 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
2969 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
2971 copy
->max_nunits
= node
->max_nunits
;
2972 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
2973 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
2974 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
2976 /* Now turn NODE into a VEC_PERM. */
2977 SLP_TREE_CHILDREN (node
).safe_push (copy
);
2978 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
2979 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2980 SLP_TREE_LANE_PERMUTATION (node
)
2981 .quick_push (std::make_pair (0, perms
[perm
][j
]));
2982 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
2987 /* Apply the reverse permutation to our stmts. */
2988 vect_slp_permute (perms
[perm
],
2989 SLP_TREE_SCALAR_STMTS (node
), true);
2990 /* And to the load permutation, which we can simply
2991 make regular by design. */
2992 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2994 /* ??? When we handle non-bijective permutes the idea
2995 is that we can force the load-permutation to be
2996 { min, min + 1, min + 2, ... max }. But then the
2997 scalar defs might no longer match the lane content
2998 which means wrong-code with live lane vectorization.
2999 So we possibly have to have NULL entries for those. */
3000 vect_slp_permute (perms
[perm
],
3001 SLP_TREE_LOAD_PERMUTATION (node
), true);
3006 /* Free the perms vector used for propagation. */
3007 while (!perms
.is_empty ())
3008 perms
.pop ().release ();
3012 /* Now elide load permutations that are not necessary. */
3013 for (i
= 0; i
< leafs
.length (); ++i
)
3015 node
= vertices
[leafs
[i
]];
3016 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3019 /* In basic block vectorization we allow any subchain of an interleaving
3021 FORNOW: not in loop SLP because of realignment complications. */
3022 if (is_a
<bb_vec_info
> (vinfo
))
3024 bool subchain_p
= true;
3025 stmt_vec_info next_load_info
= NULL
;
3026 stmt_vec_info load_info
;
3028 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3031 && (next_load_info
!= load_info
3032 || DR_GROUP_GAP (load_info
) != 1))
3037 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3041 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3047 stmt_vec_info load_info
;
3048 bool this_load_permuted
= false;
3050 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3051 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3053 this_load_permuted
= true;
3056 stmt_vec_info first_stmt_info
3057 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3058 if (!this_load_permuted
3059 /* The load requires permutation when unrolling exposes
3060 a gap either because the group is larger than the SLP
3061 group-size or because there is a gap between the groups. */
3062 && (known_eq (LOOP_VINFO_VECT_FACTOR
3063 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3064 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3065 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3067 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3074 /* Gather loads reachable from the individual SLP graph entries,
   recording them in each instance's SLP_INSTANCE_LOADS vector.  */
3077 vect_gather_slp_loads (vec_info
*vinfo
)
3080 slp_instance instance
;
3081 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
/* Per-instance visited set so shared nodes are walked once per entry.  */
3083 hash_set
<slp_tree
> visited
;
3084 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
3085 SLP_INSTANCE_TREE (instance
), visited
);
3090 /* For each possible SLP instance decide whether to SLP it and calculate overall
3091 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3092 least one instance. */
3095 vect_make_slp_decision (loop_vec_info loop_vinfo
)
/* The overall unrolling factor starts at 1 and accumulates below.  */
3098 poly_uint64 unrolling_factor
= 1;
3099 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3100 slp_instance instance
;
3101 int decided_to_slp
= 0;
3103 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3105 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3107 /* FORNOW: SLP if you can. */
3108 /* All unroll factors have the form:
3110 GET_MODE_SIZE (vinfo->vector_mode) * X
3112 for some rational X, so they must have a common multiple. */
3114 = force_common_multiple (unrolling_factor
,
3115 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3117 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3118 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3119 loop-based vectorization. Such stmts will be marked as HYBRID. */
3120 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3124 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3126 if (decided_to_slp
&& dump_enabled_p ())
3128 dump_printf_loc (MSG_NOTE
, vect_location
,
3129 "Decided to SLP %d instances. Unrolling factor ",
3131 dump_dec (MSG_NOTE
, unrolling_factor
);
3132 dump_printf (MSG_NOTE
, "\n");
3135 return (decided_to_slp
> 0);
3138 /* Private data for vect_detect_hybrid_slp. */
/* The loop being analyzed.  */
3141 loop_vec_info loop_vinfo
;
/* Worklist of loop_vect stmts still to be processed (shared with the
   caller, see vect_detect_hybrid_slp).  */
3142 vec
<stmt_vec_info
> *worklist
;
3145 /* Walker for walk_gimple_op.  Marks pure-SLP definitions feeding a
   loop_vect use as hybrid and queues them for further propagation.  */
3148 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3150 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3151 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
/* Look up the definition statement of the SSA use *TP, if any.  */
3156 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3159 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3160 if (PURE_SLP_STMT (def_stmt_info
))
3162 if (dump_enabled_p ())
3163 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3164 def_stmt_info
->stmt
);
3165 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3166 dat
->worklist
->safe_push (def_stmt_info
);
3172 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3173 if so, otherwise pushing it to WORKLIST. */
3176 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
3177 vec
<stmt_vec_info
> &worklist
,
3178 stmt_vec_info stmt_info
)
3180 if (dump_enabled_p ())
3181 dump_printf_loc (MSG_NOTE
, vect_location
,
3182 "Processing hybrid candidate : %G", stmt_info
->stmt
);
3183 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
3184 imm_use_iterator iter2
;
3186 use_operand_p use_p
;
3187 def_operand_p def_p
;
3188 bool any_def
= false;
/* Walk every SSA def of the original stmt and inspect its uses.  */
3189 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
3192 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
3194 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
3195 /* An out-of loop use means this is a loop_vect sink. */
3198 if (dump_enabled_p ())
3199 dump_printf_loc (MSG_NOTE
, vect_location
,
3200 "Found loop_vect sink: %G", stmt_info
->stmt
);
3201 worklist
.safe_push (stmt_info
);
3204 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_NOTE
, vect_location
,
3208 "Found loop_vect use: %G", use_info
->stmt
);
3209 worklist
.safe_push (stmt_info
);
3214 /* No def means this is a loop_vect sink. */
3217 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE
, vect_location
,
3219 "Found loop_vect sink: %G", stmt_info
->stmt
);
3220 worklist
.safe_push (stmt_info
);
/* All uses were SLP consumed - treat the stmt as pure SLP.  */
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE
, vect_location
,
3225 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
3226 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
3229 /* Find stmts that must be both vectorized and SLPed. */
3232 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3234 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3236 /* All stmts participating in SLP are marked pure_slp, all other
3237 stmts are loop_vect.
3238 First collect all loop_vect stmts into a worklist.
3239 SLP patterns cause not all original scalar stmts to appear in
3240 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3241 Rectify this here and do a backward walk over the IL only considering
3242 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3243 mark them as pure_slp. */
3244 auto_vec
<stmt_vec_info
> worklist
;
/* Walk the loop's basic blocks backwards.  */
3245 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
3247 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3248 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3251 gphi
*phi
= gsi
.phi ();
3252 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3253 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3254 maybe_push_to_hybrid_worklist (loop_vinfo
,
3255 worklist
, stmt_info
);
/* Non-PHI stmts, walked from the end of the block.  */
3257 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
3260 gimple
*stmt
= gsi_stmt (gsi
);
3261 if (is_gimple_debug (stmt
))
3263 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3264 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
/* For pattern stmts also process the pattern def sequence.  */
3266 for (gimple_stmt_iterator gsi2
3267 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3268 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3270 stmt_vec_info patt_info
3271 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3272 if (!STMT_SLP_TYPE (patt_info
)
3273 && STMT_VINFO_RELEVANT (patt_info
))
3274 maybe_push_to_hybrid_worklist (loop_vinfo
,
3275 worklist
, patt_info
);
3277 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3279 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3280 maybe_push_to_hybrid_worklist (loop_vinfo
,
3281 worklist
, stmt_info
);
3285 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3286 mark any SLP vectorized stmt as hybrid.
3287 ??? We're visiting def stmts N times (once for each non-SLP and
3288 once for each hybrid-SLP use). */
3291 dat
.worklist
= &worklist
;
3292 dat
.loop_vinfo
= loop_vinfo
;
3293 memset (&wi
, 0, sizeof (wi
));
3294 wi
.info
= (void *)&dat
;
3295 while (!worklist
.is_empty ())
3297 stmt_vec_info stmt_info
= worklist
.pop ();
3298 /* Since SSA operands are not set up for pattern stmts we need
3299 to use walk_gimple_op. */
3301 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3306 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3308 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3309 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
)
/* Mark all PHIs and stmts of the region with uid 0.  */
3311 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3314 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3317 gphi
*phi
= si
.phi ();
3318 gimple_set_uid (phi
, 0);
3321 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3322 !gsi_end_p (gsi
); gsi_next (&gsi
))
3324 gimple
*stmt
= gsi_stmt (gsi
);
3325 gimple_set_uid (stmt
, 0);
3326 if (is_gimple_debug (stmt
))
3334 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3335 stmts in the basic block. */
3337 _bb_vec_info::~_bb_vec_info ()
3339 /* Reset region marker.  Restore the uid to -1, the value used outside a
   vectorization region (the constructor set it to 0).  */
3340 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3343 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3346 gphi
*phi
= si
.phi ();
3347 gimple_set_uid (phi
, -1);
3349 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3350 !gsi_end_p (gsi
); gsi_next (&gsi
))
3352 gimple
*stmt
= gsi_stmt (gsi
);
3353 gimple_set_uid (stmt
, -1);
3358 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3359 given that child nodes have already been processed, and that
3360 their def types currently match their SLP node's def type. */
3363 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3364 slp_instance node_instance
,
3365 stmt_vector_for_cost
*cost_vec
)
3367 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3368 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3370 /* Calculate the number of vector statements to be created for the
3371 scalar stmts in this node. For SLP reductions it is equal to the
3372 number of vector statements in the children (which has already been
3373 calculated by the recursive call). Otherwise it is the number of
3374 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3375 VF divided by the number of elements in a vector. */
3376 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3377 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
/* SLP reduction: inherit the count from the first internal child.  */
3379 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3380 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3382 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3383 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3390 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3391 vf
= loop_vinfo
->vectorization_factor
;
3394 unsigned int group_size
= SLP_TREE_LANES (node
);
3395 tree vectype
= SLP_TREE_VECTYPE (node
);
3396 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3397 = vect_get_num_vectors (vf
* group_size
, vectype
);
3400 /* Handle purely internal nodes. */
3401 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
/* NULL iterator here: analysis/costing only, no code generation -
   TODO confirm against vectorizable_slp_permutation.  */
3402 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3404 if (is_a
<bb_vec_info
> (vinfo
)
3405 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3407 if (dump_enabled_p ())
3408 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3409 "desired vector type conflicts with earlier one "
3410 "for %G", stmt_info
->stmt
);
3415 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3416 node
, node_instance
, cost_vec
);
3419 /* Try to build NODE from scalars, returning true on success.
3420 NODE_INSTANCE is the SLP instance that contains NODE. */
/* NOTE(review): garbled extraction -- interior lines (braces, the early
   "return false"/"return true" statements) are missing.  */
3423 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3424 slp_instance node_instance
)
3426 stmt_vec_info stmt_info
;
/* Only BB vectorization non-root nodes with plain (non-pattern) scalar
   stmts can be demoted to external defs.  */
3429 if (!is_a
<bb_vec_info
> (vinfo
)
3430 || node
== SLP_INSTANCE_TREE (node_instance
)
3431 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3432 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3435 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_NOTE
, vect_location
,
3437 "Building vector operands of %p from scalars instead\n", node
);
3439 /* Don't remove and free the child nodes here, since they could be
3440 referenced by other structures. The analysis and scheduling phases
3441 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3442 unsigned int group_size
= SLP_TREE_LANES (node
);
/* Demote the node: it now provides externally-defined scalar operands
   instead of internally vectorized stmts.  */
3443 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3444 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3445 SLP_TREE_LOAD_PERMUTATION (node
).release ();
/* Populate the scalar ops from the LHS of each (original, pre-pattern)
   scalar stmt.  */
3446 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3448 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3449 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3454 /* Compute the prologue cost for invariant or constant operands represented
/* NOTE(review): garbled extraction -- the comment tail (orig 3455-3457) and
   several interior lines (the early return, the "if" guarding the elt
   comparison, the nelt counter handling) are missing; hedged comments
   below, confirm against the upstream file.  */
3458 vect_prologue_cost_for_slp (slp_tree node
,
3459 stmt_vector_for_cost
*cost_vec
)
3461 /* There's a special case of an existing vector, that costs nothing. */
3462 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3463 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3465 /* Without looking at the actual initializer a vector of
3466 constants can be implemented as load from the constant pool.
3467 When all elements are the same we can use a splat. */
3468 tree vectype
= SLP_TREE_VECTYPE (node
);
3469 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3470 unsigned num_vects_to_check
;
3471 unsigned HOST_WIDE_INT const_nunits
;
3472 unsigned nelt_limit
;
/* If the vector has a constant element count that does not divide the
   group size every generated vector differs, so each must be costed.  */
3473 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3474 && ! multiple_p (const_nunits
, group_size
))
3476 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3477 nelt_limit
= const_nunits
;
3481 /* If either the vector has variable length or the vectors
3482 are composed of repeated whole groups we only need to
3483 cost construction once. All vectors will be the same. */
3484 num_vects_to_check
= 1;
3485 nelt_limit
= group_size
;
/* ELT tracks whether all operands of the current vector are identical
   (a splat); NULL_TREE once a mismatch is seen.  */
3487 tree elt
= NULL_TREE
;
3489 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3491 unsigned si
= j
% group_size
;
3493 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3494 /* ??? We're just tracking whether all operands of a single
3495 vector initializer are the same, ideally we'd check if
3496 we emitted the same one already. */
3497 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
/* Presumably at each vector boundary (nelt == nelt_limit) one cost
   entry is recorded: scalar_to_vec for a splat of an external def,
   vec_construct otherwise -- interior lines lost, confirm upstream.  */
3500 if (nelt
== nelt_limit
)
3502 record_stmt_cost (cost_vec
, 1,
3503 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3504 ? (elt
? scalar_to_vec
: vec_construct
)
3506 NULL
, vectype
, 0, vect_prologue
);
3512 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3513 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3515 Return true if the operations are supported. */
/* NOTE(review): garbled extraction -- interior lines (declarations of i/j/
   child/res, braces, several returns) are missing per the original
   line-number gaps.  */
3518 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3519 slp_instance node_instance
,
3520 hash_set
<slp_tree
> &visited_set
,
3521 vec
<slp_tree
> &visited_vec
,
3522 stmt_vector_for_cost
*cost_vec
)
3527 /* Assume we can code-generate all invariants. */
3529 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3530 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
/* An uninitialized def type means we ran into a cycle during SLP
   discovery -- report and fail.  */
3533 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3535 if (dump_enabled_p ())
3536 dump_printf_loc (MSG_NOTE
, vect_location
,
3537 "Failed cyclic SLP reference in %p", node
);
3540 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3542 /* If we already analyzed the exact same set of scalar stmts we're done.
3543 We share the generated vector stmts for those. */
3544 if (visited_set
.add (node
))
/* Remember the node so a failing caller can undo the visited marking
   (see the pop loop below).  */
3546 visited_vec
.safe_push (node
);
/* Record rollback points for both the visited list and the cost vector
   before recursing into the children.  */
3549 unsigned visited_rec_start
= visited_vec
.length ();
3550 unsigned cost_vec_rec_start
= cost_vec
->length ();
3551 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3553 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3554 visited_set
, visited_vec
,
/* Analyze this node itself once the children succeeded.  */
3561 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3563 /* If analysis failed we have to pop all recursive visited nodes
3567 while (visited_vec
.length () >= visited_rec_start
)
3568 visited_set
.remove (visited_vec
.pop ());
3569 cost_vec
->truncate (cost_vec_rec_start
);
3572 /* When the node can be vectorized cost invariant nodes it references.
3573 This is not done in DFS order to allow the refering node
3574 vectorizable_* calls to nail down the invariant nodes vector type
3575 and possibly unshare it if it needs a different vector type than
3578 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3580 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3581 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3582 /* Perform usual caching, note code-generation still
3583 code-gens these nodes multiple times but we expect
3584 to CSE them later. */
3585 && !visited_set
.add (child
))
3587 visited_vec
.safe_push (child
);
3588 /* ??? After auditing more code paths make a "default"
3589 and push the vector type from NODE to all children
3590 if it is not already set. */
3591 /* Compute the number of vectors to be generated. */
3592 tree vector_type
= SLP_TREE_VECTYPE (child
);
3595 /* For shifts with a scalar argument we don't need
3596 to cost or code-generate anything.
3597 ??? Represent this more explicitely. */
3598 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3599 == shift_vec_info_type
)
/* VF is 1 for BB vectorization, the loop's factor otherwise.  */
3603 unsigned group_size
= SLP_TREE_LANES (child
);
3605 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3606 vf
= loop_vinfo
->vectorization_factor
;
3607 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3608 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3609 /* And cost them. */
3610 vect_prologue_cost_for_slp (child
, cost_vec
);
3613 /* If this node or any of its children can't be vectorized, try pruning
3614 the tree here rather than felling the whole thing. */
3615 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3617 /* We'll need to revisit this for invariant costing and number
3618 of vectorized stmt setting. */
3626 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3627 region and that can be vectorized using vectorizable_live_operation
3628 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3629 scalar code computing it to be retained. */
/* NOTE(review): garbled extraction -- interior lines (braces, declarations
   of i/child/use_stmt, some continues) are missing per the original
   line-number gaps.  */
3632 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3633 slp_instance instance
,
3634 stmt_vector_for_cost
*cost_vec
,
3635 hash_set
<stmt_vec_info
> &svisited
,
3636 hash_set
<slp_tree
> &visited
)
/* Visit each SLP node only once.  */
3638 if (visited
.add (node
))
3642 stmt_vec_info stmt_info
;
3643 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3644 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
/* Each scalar stmt is handled once across the whole graph.  */
3646 if (svisited
.contains (stmt_info
))
3648 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3649 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
3650 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
3651 /* Only the pattern root stmt computes the original scalar value. */
3653 bool mark_visited
= true;
3654 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3655 ssa_op_iter op_iter
;
3656 def_operand_p def_p
;
/* Scan all SSA defs of the stmt for uses outside the SLP region.  */
3657 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3659 imm_use_iterator use_iter
;
3661 stmt_vec_info use_stmt_info
;
3662 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3663 if (!is_gimple_debug (use_stmt
))
3665 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
/* A use not covered by pure-SLP vectorization makes the def live.  */
3667 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3669 STMT_VINFO_LIVE_P (stmt_info
) = true;
3670 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3671 NULL
, node
, instance
, i
,
3673 /* ??? So we know we can vectorize the live stmt
3674 from one SLP node. If we cannot do so from all
3675 or none consistently we'd have to record which
3676 SLP node (and lane) we want to use for the live
3677 operation. So make sure we can code-generate
3679 mark_visited
= false;
/* Lane extract not supported: keep the stmt scalar.  */
3681 STMT_VINFO_LIVE_P (stmt_info
) = false;
3682 BREAK_FROM_IMM_USE_STMT (use_iter
);
3685 /* We have to verify whether we can insert the lane extract
3686 before all uses. The following is a conservative approximation.
3687 We cannot put this into vectorizable_live_operation because
3688 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3690 Note that while the fact that we emit code for loads at the
3691 first load should make this a non-problem leafs we construct
3692 from scalars are vectorized after the last scalar def.
3693 ??? If we'd actually compute the insert location during
3694 analysis we could use sth less conservative than the last
3695 scalar stmt in the node for the dominance check. */
3696 /* ??? What remains is "live" uses in vector CTORs in the same
3697 SLP graph which is where those uses can end up code-generated
3698 right after their definition instead of close to their original
3699 use. But that would restrict us to code-generate lane-extracts
3700 from the latest stmt in a node. So we compensate for this
3701 during code-generation, simply not replacing uses for those
3702 hopefully rare cases. */
3703 if (STMT_VINFO_LIVE_P (stmt_info
))
3704 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3705 if (!is_gimple_debug (use_stmt
)
3706 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
3707 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3708 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
3710 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3712 "Cannot determine insertion place for "
3714 STMT_VINFO_LIVE_P (stmt_info
) = false;
3715 mark_visited
= true;
3719 svisited
.add (stmt_info
);
/* Recurse into internal-def children.  */
3723 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3724 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3725 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
3726 cost_vec
, svisited
, visited
);
3729 /* Analyze statements in SLP instances of VINFO. Return true if the
3730 operations are supported. */
/* NOTE(review): garbled extraction -- interior lines (braces, the `++i`
   increment on success, some dump arguments) are missing per the original
   line-number gaps.  */
3733 vect_slp_analyze_operations (vec_info
*vinfo
)
3735 slp_instance instance
;
3738 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3740 hash_set
<slp_tree
> visited
;
/* Note: no ++i in the loop header -- failing instances are removed with
   ordered_remove so the index stays put.  */
3741 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
3743 auto_vec
<slp_tree
> visited_vec
;
3744 stmt_vector_for_cost cost_vec
;
3745 cost_vec
.create (2);
3746 if (is_a
<bb_vec_info
> (vinfo
))
3747 vect_location
= instance
->location ();
3748 if (!vect_slp_analyze_node_operations (vinfo
,
3749 SLP_INSTANCE_TREE (instance
),
3750 instance
, visited
, visited_vec
,
3752 /* Instances with a root stmt require vectorized defs for the
3754 || (SLP_INSTANCE_ROOT_STMT (instance
)
3755 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
3756 != vect_internal_def
)))
/* Analysis failed: drop the instance, its costs and its visited
   markings so other instances can be re-analyzed cleanly.  */
3758 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3759 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3760 if (dump_enabled_p ())
3761 dump_printf_loc (MSG_NOTE
, vect_location
,
3762 "removing SLP instance operations starting from: %G",
3764 vect_free_slp_instance (instance
);
3765 vinfo
->slp_instances
.ordered_remove (i
);
3766 cost_vec
.release ();
3767 while (!visited_vec
.is_empty ())
3768 visited
.remove (visited_vec
.pop ());
3774 /* For BB vectorization remember the SLP graph entry
/* BB costs are kept per-instance for later subgraph-wise costing;
   loop costs are added to the global cost data right away.  */
3776 if (is_a
<bb_vec_info
> (vinfo
))
3777 instance
->cost_vec
= cost_vec
;
3780 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
3781 cost_vec
.release ();
3786 /* Compute vectorizable live stmts. */
3787 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3789 hash_set
<stmt_vec_info
> svisited
;
3790 hash_set
<slp_tree
> visited
;
3791 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3793 vect_location
= instance
->location ();
3794 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
3795 instance
, &instance
->cost_vec
, svisited
,
/* Success iff at least one instance survived.  */
3800 return !vinfo
->slp_instances
.is_empty ();
3803 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3804 closing the eventual chain. */
/* NOTE(review): garbled extraction -- interior lines (the `instance = *tem;`
   chain step, the return) are missing per the original line-number gaps.
   This is a union-find "find" with path compression: follow leader links
   to the fixed point, then rewrite each visited slot to point there.  */
3807 get_ultimate_leader (slp_instance instance
,
3808 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3810 auto_vec
<slp_instance
*, 8> chain
;
/* Walk the leader chain until an instance is its own leader.  */
3812 while (*(tem
= instance_leader
.get (instance
)) != instance
)
3814 chain
.safe_push (tem
);
/* Path-compress: point every visited link at the ultimate leader.  */
3817 while (!chain
.is_empty ())
3818 *chain
.pop () = instance
;
3822 /* Worker of vect_bb_partition_graph, recurse on NODE. */
/* NOTE(review): garbled extraction -- interior lines (braces, the
   `existed_p` declaration, a continue on first-insert) are missing per the
   original line-number gaps.  */
3825 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
3826 slp_instance instance
, slp_tree node
,
3827 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
3828 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
3829 hash_set
<slp_tree
> &visited
)
3831 stmt_vec_info stmt_info
;
/* Claim every scalar stmt of this node for INSTANCE; when a stmt is
   already claimed by another instance, merge the two graph parts by
   making INSTANCE the leader of the other's ultimate leader.  */
3834 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3837 slp_instance
&stmt_instance
3838 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
3841 else if (stmt_instance
!= instance
)
3843 /* If we're running into a previously marked stmt make us the
3844 leader of the current ultimate leader. This keeps the
3845 leader chain acyclic and works even when the current instance
3846 connects two previously independent graph parts. */
3847 slp_instance stmt_leader
3848 = get_ultimate_leader (stmt_instance
, instance_leader
);
3849 if (stmt_leader
!= instance
)
3850 instance_leader
.put (stmt_leader
, instance
);
3852 stmt_instance
= instance
;
/* Recurse once per node into internal-def children.  */
3855 if (visited
.add (node
))
3859 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3860 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3861 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
3862 instance_leader
, visited
);
3865 /* Partition the SLP graph into pieces that can be costed independently. */
/* NOTE(review): garbled extraction -- interior lines (braces, trailing dump
   arguments) are missing per the original line-number gaps.  */
3868 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
3870 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3872 /* First walk the SLP graph assigning each involved scalar stmt a
3873 corresponding SLP graph entry and upon visiting a previously
3874 marked stmt, make the stmts leader the current SLP graph entry. */
3875 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
3876 hash_map
<slp_instance
, slp_instance
> instance_leader
;
3877 hash_set
<slp_tree
> visited
;
3878 slp_instance instance
;
3879 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
/* Each instance starts as its own leader (union-find init).  */
3881 instance_leader
.put (instance
, instance
);
3882 vect_bb_partition_graph_r (bb_vinfo
,
3883 instance
, SLP_INSTANCE_TREE (instance
),
3884 stmt_to_instance
, instance_leader
,
3888 /* Then collect entries to each independent subgraph. */
3889 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3891 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
/* Every instance is recorded on its leader's subgraph_entries list,
   including the leader itself.  */
3892 leader
->subgraph_entries
.safe_push (instance
);
3893 if (dump_enabled_p ()
3894 && leader
!= instance
)
3895 dump_printf_loc (MSG_NOTE
, vect_location
,
3896 "instance %p is leader of %p\n",
3901 /* Compute the scalar cost of the SLP node NODE and its children
3902 and return it. Do not account defs that are marked in LIFE and
3903 update LIFE according to uses of NODE. */
/* NOTE(review): garbled extraction -- interior lines (braces, the LIFE[i]
   checks, the scalar_load/scalar_stmt kind assignments, continues) are
   missing per the original line-number gaps.  */
3906 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
3907 slp_tree node
, vec
<bool, va_heap
> *life
,
3908 stmt_vector_for_cost
*cost_vec
,
3909 hash_set
<slp_tree
> &visited
)
3912 stmt_vec_info stmt_info
;
3915 if (visited
.add (node
))
3918 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3920 ssa_op_iter op_iter
;
3921 def_operand_p def_p
;
3926 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3927 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3929 /* If there is a non-vectorized use of the defs then the scalar
3930 stmt is kept live in which case we do not account it or any
3931 required defs in the SLP children in the scalar cost. This
3932 way we make the vectorization more costly when compared to
3934 if (!STMT_VINFO_LIVE_P (stmt_info
))
3936 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3938 imm_use_iterator use_iter
;
3940 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3941 if (!is_gimple_debug (use_stmt
))
3943 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
3946 (vect_stmt_to_vectorize (use_stmt_info
)))
3949 BREAK_FROM_IMM_USE_STMT (use_iter
);
3957 /* Count scalar stmts only once. */
/* The gimple "visited" flag de-duplicates stmts shared between nodes;
   it is cleared again by the caller.  */
3958 if (gimple_visited_p (orig_stmt
))
3960 gimple_set_visited (orig_stmt
, true);
3962 vect_cost_for_stmt kind
;
/* Classify the scalar cost: load/store for data refs (the scalar_load
   branch body is lost in extraction), nop for no-op conversions,
   otherwise presumably scalar_stmt -- confirm against upstream.  */
3963 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
3965 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
3968 kind
= scalar_store
;
3970 else if (vect_nop_conversion_p (orig_stmt_info
))
3974 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
3975 SLP_TREE_VECTYPE (node
), 0, vect_body
);
3978 auto_vec
<bool, 20> subtree_life
;
3979 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3981 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3983 /* Do not directly pass LIFE to the recursive call, copy it to
3984 confine changes in the callee to the current child/subtree. */
/* For permute nodes LIFE lanes must be remapped through the lane
   permutation before recursing into the permuted child.  */
3985 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3987 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
3988 for (unsigned j
= 0;
3989 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
3991 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
3992 if (perm
.first
== i
)
3993 subtree_life
[perm
.second
] = (*life
)[j
];
3998 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
3999 subtree_life
.safe_splice (*life
);
4001 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
4003 subtree_life
.truncate (0);
4008 /* Check if vectorization of the basic block is profitable for the
4009 subgraph denoted by SLP_INSTANCES. */
/* NOTE(review): garbled extraction -- interior lines (braces, some dump
   arguments, the final returns) are missing per the original line-number
   gaps.  */
4012 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
4013 vec
<slp_instance
> slp_instances
)
4015 slp_instance instance
;
4017 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
4018 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
/* Fresh target cost data accumulating the vector-side costs collected
   during analysis.  */
4020 void *vect_target_cost_data
= init_cost (NULL
);
4022 /* Calculate scalar cost and sum the cost for the vector stmts
4023 previously collected. */
4024 stmt_vector_for_cost scalar_costs
;
4025 scalar_costs
.create (0);
4026 hash_set
<slp_tree
> visited
;
4027 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4029 auto_vec
<bool, 20> life
;
4030 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4032 vect_bb_slp_scalar_cost (bb_vinfo
,
4033 SLP_INSTANCE_TREE (instance
),
4034 &life
, &scalar_costs
, visited
);
/* Fold this instance's pre-collected vector costs into the target
   data; the per-instance vector is no longer needed afterwards.  */
4035 add_stmt_costs (bb_vinfo
, vect_target_cost_data
, &instance
->cost_vec
);
4036 instance
->cost_vec
.release ();
4038 /* Unset visited flag. */
/* vect_bb_slp_scalar_cost used the gimple visited flag for
   de-duplication; clear it for all costed stmts.  */
4039 stmt_info_for_cost
*si
;
4040 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
4041 gimple_set_visited (si
->stmt_info
->stmt
, false);
4043 void *scalar_target_cost_data
= init_cost (NULL
);
4044 add_stmt_costs (bb_vinfo
, scalar_target_cost_data
, &scalar_costs
);
4045 scalar_costs
.release ();
4047 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4048 destroy_cost_data (scalar_target_cost_data
);
4050 /* Complete the target-specific vector cost calculation. */
4051 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4052 &vec_inside_cost
, &vec_epilogue_cost
);
4053 destroy_cost_data (vect_target_cost_data
);
4055 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4057 if (dump_enabled_p ())
4059 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4060 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
4062 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
4063 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
4064 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
4067 /* Vectorization is profitable if its cost is more than the cost of scalar
4068 version. Note that we err on the vector side for equal cost because
4069 the cost estimate is otherwise quite pessimistic (constant uses are
4070 free on the scalar side but cost a load on the vector side for
4072 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4078 /* Find any vectorizable constructors and add them to the grouped_store
/* NOTE(review): garbled extraction -- the comment tail (orig 4079-4081) and
   interior lines (braces, the j/val declarations, continues) are missing
   per the original line-number gaps.  */
4082 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
/* Scan every stmt of every block in the region for vector CONSTRUCTOR
   assignments.  */
4084 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4085 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4086 !gsi_end_p (gsi
); gsi_next (&gsi
))
4088 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4089 if (!assign
|| gimple_assign_rhs_code (assign
) != CONSTRUCTOR
)
/* Only complete, non-nested, non-uniform vector constructors are
   interesting (a splat is cheaper built directly).  */
4092 tree rhs
= gimple_assign_rhs1 (assign
);
4093 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4094 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4095 CONSTRUCTOR_NELTS (rhs
))
4096 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4097 || uniform_vector_p (rhs
))
/* All elements must be SSA names defined inside the region.  */
4102 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
4103 if (TREE_CODE (val
) != SSA_NAME
4104 || !bb_vinfo
->lookup_def (val
))
4106 if (j
!= CONSTRUCTOR_NELTS (rhs
))
/* Record the CTOR stmt as an SLP seed alongside grouped stores.  */
4109 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4110 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4114 /* Walk the grouped store chains and replace entries with their
4115 pattern variant if any. */
/* NOTE(review): garbled extraction -- interior lines (braces, a continue,
   the prev-advance) are missing per the original line-number gaps.  */
4118 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
4120 stmt_vec_info first_element
;
4123 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
4125 /* We also have CTORs in this array. */
4126 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
/* If the group head was replaced by a pattern stmt, move the
   DR_GROUP_* data onto the pattern stmt and re-seat the array
   entry on it.  */
4128 if (STMT_VINFO_IN_PATTERN_P (first_element
))
4130 stmt_vec_info orig
= first_element
;
4131 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
4132 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
4133 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
4134 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
4135 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
4136 vinfo
->grouped_stores
[i
] = first_element
;
/* Then walk the chain and do the same for every member.  */
4138 stmt_vec_info prev
= first_element
;
4139 while (DR_GROUP_NEXT_ELEMENT (prev
))
4141 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
4142 if (STMT_VINFO_IN_PATTERN_P (elt
))
4144 stmt_vec_info orig
= elt
;
4145 elt
= STMT_VINFO_RELATED_STMT (elt
);
4146 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
4147 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
4148 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
/* Every chain member points back to the (possibly replaced) head.  */
4150 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
4156 /* Check if the region described by BB_VINFO can be vectorized, returning
4157 true if so. When returning false, set FATAL to true if the same failure
4158 would prevent vectorization at other vector sizes, false if it is still
4159 worth trying other sizes. N_STMTS is the number of statements in the
/* NOTE(review): garbled extraction -- the comment tail, several `return
   false` lines, `fatal` assignments and braces are missing per the
   original line-number gaps.  */
4163 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4164 vec
<int> *dataref_groups
)
4166 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4168 slp_instance instance
;
4170 poly_uint64 min_vf
= 2;
4172 /* The first group of checks is independent of the vector size. */
4175 /* Analyze the data references. */
4177 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4179 if (dump_enabled_p ())
4180 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4181 "not vectorized: unhandled data-ref in basic "
4186 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4188 if (dump_enabled_p ())
4189 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4190 "not vectorized: unhandled data access in "
/* Seed SLP discovery also from vector CTOR stmts.  */
4195 vect_slp_check_for_constructors (bb_vinfo
);
4197 /* If there are no grouped stores and no constructors in the region
4198 there is no need to continue with pattern recog as vect_analyze_slp
4199 will fail anyway. */
4200 if (bb_vinfo
->grouped_stores
.is_empty ())
4202 if (dump_enabled_p ())
4203 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4204 "not vectorized: no grouped stores in "
4209 /* While the rest of the analysis below depends on it in some way. */
4212 vect_pattern_recog (bb_vinfo
);
4214 /* Update store groups from pattern processing. */
4215 vect_fixup_store_groups_with_patterns (bb_vinfo
);
4217 /* Check the SLP opportunities in the basic block, analyze and build SLP
4219 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4221 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4224 "Failed to SLP the basic block.\n");
4225 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4226 "not vectorized: failed to find SLP opportunities "
4227 "in basic block.\n");
4232 /* Optimize permutations. */
4233 vect_optimize_slp (bb_vinfo
);
4235 /* Gather the loads reachable from the SLP graph entries. */
4236 vect_gather_slp_loads (bb_vinfo
);
4238 vect_record_base_alignments (bb_vinfo
);
4240 /* Analyze and verify the alignment of data references and the
4241 dependence in the SLP instances. */
/* No ++i: failing instances are ordered_remove'd in place.  */
4242 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4244 vect_location
= instance
->location ();
4245 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4246 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4248 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4249 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4250 if (dump_enabled_p ())
4251 dump_printf_loc (MSG_NOTE
, vect_location
,
4252 "removing SLP instance operations starting from: %G",
4254 vect_free_slp_instance (instance
);
4255 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4259 /* Mark all the statements that we want to vectorize as pure SLP and
4261 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4262 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4263 if (SLP_INSTANCE_ROOT_STMT (instance
))
4264 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance
)) = pure_slp
;
4268 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4271 if (!vect_slp_analyze_operations (bb_vinfo
))
4273 if (dump_enabled_p ())
4274 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4275 "not vectorized: bad operation in basic block.\n");
/* Split the surviving instances into independently costable
   subgraphs.  */
4279 vect_bb_partition_graph (bb_vinfo
);
4284 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4285 basic blocks in BBS, returning true on success.
4286 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
/* NOTE(review): garbled extraction -- many interior lines (the outer
   `while (1)` driver, `delete bb_vinfo`, `fatal` handling, mode_i
   increments, returns) are missing per the original line-number gaps.
   The visible structure is the usual iterate-over-vector-modes driver.  */
4289 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4290 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4292 bb_vec_info bb_vinfo
;
4293 auto_vector_modes vector_modes
;
4295 /* Autodetect first vector size we try. */
4296 machine_mode next_vector_mode
= VOIDmode
;
4297 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4298 unsigned int mode_i
= 0;
4300 vec_info_shared shared
;
4302 machine_mode autodetected_vector_mode
= VOIDmode
;
4305 bool vectorized
= false;
/* One fresh bb_vec_info per attempted vector mode.  */
4307 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4309 bool first_time_p
= shared
.datarefs
.is_empty ();
4310 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
/* First iteration saves the dataref state, later ones verify it.  */
4312 bb_vinfo
->shared
->save_datarefs ();
4314 bb_vinfo
->shared
->check_datarefs ();
4315 bb_vinfo
->vector_mode
= next_vector_mode
;
4317 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
)
4318 && dbg_cnt (vect_slp
))
4320 if (dump_enabled_p ())
4322 dump_printf_loc (MSG_NOTE
, vect_location
,
4323 "***** Analysis succeeded with vector mode"
4324 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4325 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4328 bb_vinfo
->shared
->check_datarefs ();
/* Cost and transform each independent subgraph on its own; only
   leader instances carry non-empty subgraph_entries.  */
4331 slp_instance instance
;
4332 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4334 if (instance
->subgraph_entries
.is_empty ())
4337 vect_location
= instance
->location ();
4338 if (!unlimited_cost_model (NULL
)
4339 && !vect_bb_vectorization_profitable_p
4340 (bb_vinfo
, instance
->subgraph_entries
))
4342 if (dump_enabled_p ())
4343 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4344 "not vectorized: vectorization is not "
4349 if (!vectorized
&& dump_enabled_p ())
4350 dump_printf_loc (MSG_NOTE
, vect_location
,
4351 "Basic block will be vectorized "
4355 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4357 unsigned HOST_WIDE_INT bytes
;
4358 if (dump_enabled_p ())
4361 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4362 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4363 "basic block part vectorized using %wu "
4364 "byte vectors\n", bytes
);
4366 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4367 "basic block part vectorized using "
4368 "variable length vectors\n");
4374 if (dump_enabled_p ())
4375 dump_printf_loc (MSG_NOTE
, vect_location
,
4376 "***** Analysis failed with vector mode %s\n",
4377 GET_MODE_NAME (bb_vinfo
->vector_mode
));
/* Remember the autodetected mode from the first attempt so later
   candidate modes that would repeat the same analysis are skipped.  */
4381 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
4384 while (mode_i
< vector_modes
.length ()
4385 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
4387 if (dump_enabled_p ())
4388 dump_printf_loc (MSG_NOTE
, vect_location
,
4389 "***** The result for vector mode %s would"
4391 GET_MODE_NAME (vector_modes
[mode_i
]));
/* Also skip a candidate mode that is mutually related to the
   autodetected one (same analysis outcome both directions).  */
4397 if (mode_i
< vector_modes
.length ()
4398 && VECTOR_MODE_P (autodetected_vector_mode
)
4399 && (related_vector_mode (vector_modes
[mode_i
],
4400 GET_MODE_INNER (autodetected_vector_mode
))
4401 == autodetected_vector_mode
)
4402 && (related_vector_mode (autodetected_vector_mode
,
4403 GET_MODE_INNER (vector_modes
[mode_i
]))
4404 == vector_modes
[mode_i
]))
4406 if (dump_enabled_p ())
4407 dump_printf_loc (MSG_NOTE
, vect_location
,
4408 "***** Skipping vector mode %s, which would"
4409 " repeat the analysis for %s\n",
4410 GET_MODE_NAME (vector_modes
[mode_i
]),
4411 GET_MODE_NAME (autodetected_vector_mode
));
/* Stop when modes are exhausted, autodetection failed, or the
   failure was fatal for all sizes (condition partly lost).  */
4416 || mode_i
== vector_modes
.length ()
4417 || autodetected_vector_mode
== VOIDmode
4418 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4419 vector sizes will fail do not bother iterating. */
4423 /* Try the next biggest vector size. */
4424 next_vector_mode
= vector_modes
[mode_i
++];
4425 if (dump_enabled_p ())
4426 dump_printf_loc (MSG_NOTE
, vect_location
,
4427 "***** Re-trying analysis with vector mode %s\n",
4428 GET_MODE_NAME (next_vector_mode
));
4433 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4434 true if anything in the basic-block was vectorized. */
4437 vect_slp_bbs (vec
<basic_block
> bbs
)
4439 vec
<data_reference_p
> datarefs
= vNULL
;
4440 auto_vec
<int> dataref_groups
;
4442 int current_group
= 0;
4444 for (unsigned i
= 0; i
< bbs
.length (); i
++)
4446 basic_block bb
= bbs
[i
];
4447 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
4450 gimple
*stmt
= gsi_stmt (gsi
);
4451 if (is_gimple_debug (stmt
))
4456 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
4457 vect_location
= stmt
;
4459 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
4460 &dataref_groups
, current_group
))
4465 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
4468 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4469 true if anything in the basic-block was vectorized. */
4472 vect_slp_bb (basic_block bb
)
4474 auto_vec
<basic_block
> bbs
;
4476 return vect_slp_bbs (bbs
);
4479 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4480 true if anything in the basic-block was vectorized. */
4483 vect_slp_function (function
*fun
)
4486 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
4487 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
4489 /* For the moment split the function into pieces to avoid making
4490 the iteration on the vector mode moot. Split at points we know
4491 to not handle well which is CFG merges (SLP discovery doesn't
4492 handle non-loop-header PHIs) and loop exits. Since pattern
4493 recog requires reverse iteration to visit uses before defs
4494 simply chop RPO into pieces. */
4495 auto_vec
<basic_block
> bbs
;
4496 for (unsigned i
= 0; i
< n
; i
++)
4498 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
4501 /* Split when a BB is not dominated by the first block. */
4502 if (!bbs
.is_empty ()
4503 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
4505 if (dump_enabled_p ())
4506 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4507 "splitting region at dominance boundary bb%d\n",
4511 /* Split when the loop determined by the first block
4512 is exited. This is because we eventually insert
4513 invariants at region begin. */
4514 else if (!bbs
.is_empty ()
4515 && bbs
[0]->loop_father
!= bb
->loop_father
4516 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
4518 if (dump_enabled_p ())
4519 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4520 "splitting region at loop %d exit at bb%d\n",
4521 bbs
[0]->loop_father
->num
, bb
->index
);
4525 if (split
&& !bbs
.is_empty ())
4527 r
|= vect_slp_bbs (bbs
);
4529 bbs
.quick_push (bb
);
4534 /* When we have a stmt ending this block and defining a
4535 value we have to insert on edges when inserting after it for
4536 a vector containing its definition. Avoid this for now. */
4537 if (gimple
*last
= last_stmt (bb
))
4538 if (gimple_get_lhs (last
)
4539 && is_ctrl_altering_stmt (last
))
4541 if (dump_enabled_p ())
4542 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4543 "splitting region at control altering "
4544 "definition %G", last
);
4545 r
|= vect_slp_bbs (bbs
);
4550 if (!bbs
.is_empty ())
4551 r
|= vect_slp_bbs (bbs
);
4558 /* Build a variable-length vector in which the elements in ELTS are repeated
4559 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4560 RESULTS and add any new instructions to SEQ.
4562 The approach we use is:
4564 (1) Find a vector mode VM with integer elements of mode IM.
4566 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4567 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4568 from small vectors to IM.
4570 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4572 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4573 correct byte contents.
4575 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4577 We try to find the largest IM for which this sequence works, in order
4578 to cut down on the number of interleaves. */
4581 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
4582 vec
<tree
> elts
, unsigned int nresults
,
4585 unsigned int nelts
= elts
.length ();
4586 tree element_type
= TREE_TYPE (vector_type
);
4588 /* (1) Find a vector mode VM with integer elements of mode IM. */
4589 unsigned int nvectors
= 1;
4590 tree new_vector_type
;
4592 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
4593 &nvectors
, &new_vector_type
,
4597 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4598 unsigned int partial_nelts
= nelts
/ nvectors
;
4599 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
4601 tree_vector_builder partial_elts
;
4602 auto_vec
<tree
, 32> pieces (nvectors
* 2);
4603 pieces
.quick_grow (nvectors
* 2);
4604 for (unsigned int i
= 0; i
< nvectors
; ++i
)
4606 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4607 ELTS' has mode IM. */
4608 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
4609 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
4610 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
4611 tree t
= gimple_build_vector (seq
, &partial_elts
);
4612 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
4613 TREE_TYPE (new_vector_type
), t
);
4615 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4616 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
4619 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4620 correct byte contents.
4622 We need to repeat the following operation log2(nvectors) times:
4624 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4625 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4627 However, if each input repeats every N elements and the VF is
4628 a multiple of N * 2, the HI result is the same as the LO. */
4629 unsigned int in_start
= 0;
4630 unsigned int out_start
= nvectors
;
4631 unsigned int hi_start
= nvectors
/ 2;
4632 /* A bound on the number of outputs needed to produce NRESULTS results
4633 in the final iteration. */
4634 unsigned int noutputs_bound
= nvectors
* nresults
;
4635 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
4637 noutputs_bound
/= 2;
4638 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
4639 for (unsigned int i
= 0; i
< limit
; ++i
)
4642 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
4645 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
4649 tree output
= make_ssa_name (new_vector_type
);
4650 tree input1
= pieces
[in_start
+ (i
/ 2)];
4651 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
4652 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
4655 gimple_seq_add_stmt (seq
, stmt
);
4656 pieces
[out_start
+ i
] = output
;
4658 std::swap (in_start
, out_start
);
4661 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4662 results
.reserve (nresults
);
4663 for (unsigned int i
= 0; i
< nresults
; ++i
)
4665 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
4666 pieces
[in_start
+ i
]));
4668 results
.quick_push (results
[i
- nvectors
]);
4672 /* For constant and loop invariant defs in OP_NODE this function creates
4673 vector defs that will be used in the vectorized stmts and stores them
4674 to SLP_TREE_VEC_DEFS of OP_NODE. */
4677 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
4679 unsigned HOST_WIDE_INT nunits
;
4681 unsigned j
, number_of_places_left_in_vector
;
4684 int group_size
= op_node
->ops
.length ();
4685 unsigned int vec_num
, i
;
4686 unsigned number_of_copies
= 1;
4688 gimple_seq ctor_seq
= NULL
;
4689 auto_vec
<tree
, 16> permute_results
;
4691 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4692 vector_type
= SLP_TREE_VECTYPE (op_node
);
4694 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
4695 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
4696 auto_vec
<tree
> voprnds (number_of_vectors
);
4698 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4699 created vectors. It is greater than 1 if unrolling is performed.
4701 For example, we have two scalar operands, s1 and s2 (e.g., group of
4702 strided accesses of size two), while NUNITS is four (i.e., four scalars
4703 of this type can be packed in a vector). The output vector will contain
4704 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4707 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4708 containing the operands.
4710 For example, NUNITS is four as before, and the group size is 8
4711 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4712 {s5, s6, s7, s8}. */
4714 /* When using duplicate_and_interleave, we just need one element for
4715 each scalar statement. */
4716 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4717 nunits
= group_size
;
4719 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
4721 number_of_places_left_in_vector
= nunits
;
4723 tree_vector_builder
elts (vector_type
, nunits
, 1);
4724 elts
.quick_grow (nunits
);
4725 stmt_vec_info insert_after
= NULL
;
4726 for (j
= 0; j
< number_of_copies
; j
++)
4729 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
4731 /* Create 'vect_ = {op0,op1,...,opn}'. */
4732 number_of_places_left_in_vector
--;
4734 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
4736 if (CONSTANT_CLASS_P (op
))
4738 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4740 /* Can't use VIEW_CONVERT_EXPR for booleans because
4741 of possibly different sizes of scalar value and
4743 if (integer_zerop (op
))
4744 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
4745 else if (integer_onep (op
))
4746 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
4751 op
= fold_unary (VIEW_CONVERT_EXPR
,
4752 TREE_TYPE (vector_type
), op
);
4753 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
4757 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
4759 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4762 = build_all_ones_cst (TREE_TYPE (vector_type
));
4764 = build_zero_cst (TREE_TYPE (vector_type
));
4765 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
4766 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
4772 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
4775 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
4778 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
4782 elts
[number_of_places_left_in_vector
] = op
;
4783 if (!CONSTANT_CLASS_P (op
))
4785 /* For BB vectorization we have to compute an insert location
4786 when a def is inside the analyzed region since we cannot
4787 simply insert at the BB start in this case. */
4788 stmt_vec_info opdef
;
4789 if (TREE_CODE (orig_op
) == SSA_NAME
4790 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
4791 && is_a
<bb_vec_info
> (vinfo
)
4792 && (opdef
= vinfo
->lookup_def (orig_op
)))
4795 insert_after
= opdef
;
4797 insert_after
= get_later_stmt (insert_after
, opdef
);
4800 if (number_of_places_left_in_vector
== 0)
4803 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4804 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4805 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
4808 if (permute_results
.is_empty ())
4809 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
4810 elts
, number_of_vectors
,
4812 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
4814 if (!gimple_seq_empty_p (ctor_seq
))
4818 gimple_stmt_iterator gsi
;
4819 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
4821 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
4822 gsi_insert_seq_before (&gsi
, ctor_seq
,
4823 GSI_CONTINUE_LINKING
);
4825 else if (!stmt_ends_bb_p (insert_after
->stmt
))
4827 gsi
= gsi_for_stmt (insert_after
->stmt
);
4828 gsi_insert_seq_after (&gsi
, ctor_seq
,
4829 GSI_CONTINUE_LINKING
);
4833 /* When we want to insert after a def where the
4834 defining stmt throws then insert on the fallthru
4836 edge e
= find_fallthru_edge
4837 (gimple_bb (insert_after
->stmt
)->succs
);
4839 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
4840 gcc_assert (!new_bb
);
4844 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
4847 voprnds
.quick_push (vec_cst
);
4848 insert_after
= NULL
;
4849 number_of_places_left_in_vector
= nunits
;
4851 elts
.new_vector (vector_type
, nunits
, 1);
4852 elts
.quick_grow (nunits
);
4857 /* Since the vectors are created in the reverse order, we should invert
4859 vec_num
= voprnds
.length ();
4860 for (j
= vec_num
; j
!= 0; j
--)
4862 vop
= voprnds
[j
- 1];
4863 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4866 /* In case that VF is greater than the unrolling factor needed for the SLP
4867 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4868 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4869 to replicate the vectors. */
4870 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
4871 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
4873 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4876 /* Get the Ith vectorized definition from SLP_NODE. */
4879 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
4881 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
4882 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
4884 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
4887 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4890 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
4892 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
4893 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
4896 gimple
*vec_def_stmt
;
4897 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
4898 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
4901 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
4904 /* Get N vectorized definitions for SLP_NODE. */
4907 vect_get_slp_defs (vec_info
*,
4908 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
4911 n
= SLP_TREE_CHILDREN (slp_node
).length ();
4913 for (unsigned i
= 0; i
< n
; ++i
)
4915 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
4916 vec
<tree
> vec_defs
= vNULL
;
4917 vect_get_slp_defs (child
, &vec_defs
);
4918 vec_oprnds
->quick_push (vec_defs
);
4922 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4923 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4924 permute statements for the SLP node NODE. Store the number of vector
4925 permute instructions in *N_PERMS and the number of vector load
4926 instructions in *N_LOADS. */
4929 vect_transform_slp_perm_load (vec_info
*vinfo
,
4930 slp_tree node
, vec
<tree
> dr_chain
,
4931 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
4932 bool analyze_only
, unsigned *n_perms
,
4933 unsigned int *n_loads
)
4935 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4937 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4938 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
4939 unsigned int mask_element
;
4942 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
4945 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4947 mode
= TYPE_MODE (vectype
);
4948 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4950 /* Initialize the vect stmts of NODE to properly insert the generated
4953 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
4954 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
4955 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
4957 /* Generate permutation masks for every NODE. Number of masks for each NODE
4958 is equal to GROUP_SIZE.
4959 E.g., we have a group of three nodes with three loads from the same
4960 location in each node, and the vector size is 4. I.e., we have a
4961 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4962 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4963 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4966 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4967 The last mask is illegal since we assume two operands for permute
4968 operation, and the mask element values can't be outside that range.
4969 Hence, the last mask must be converted into {2,5,5,5}.
4970 For the first two permutations we need the first and the second input
4971 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4972 we need the second and the third vectors: {b1,c1,a2,b2} and
4975 int vect_stmts_counter
= 0;
4976 unsigned int index
= 0;
4977 int first_vec_index
= -1;
4978 int second_vec_index
= -1;
4982 vec_perm_builder mask
;
4983 unsigned int nelts_to_build
;
4984 unsigned int nvectors_per_build
;
4985 unsigned int in_nlanes
;
4986 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
4987 && multiple_p (nunits
, group_size
));
4990 /* A single vector contains a whole number of copies of the node, so:
4991 (a) all permutes can use the same mask; and
4992 (b) the permutes only need a single vector input. */
4993 mask
.new_vector (nunits
, group_size
, 3);
4994 nelts_to_build
= mask
.encoded_nelts ();
4995 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
4996 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5000 /* We need to construct a separate mask for each vector statement. */
5001 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
5002 if (!nunits
.is_constant (&const_nunits
)
5003 || !vf
.is_constant (&const_vf
))
5005 mask
.new_vector (const_nunits
, const_nunits
, 1);
5006 nelts_to_build
= const_vf
* group_size
;
5007 nvectors_per_build
= 1;
5008 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
5010 auto_sbitmap
used_in_lanes (in_nlanes
);
5011 bitmap_clear (used_in_lanes
);
5013 unsigned int count
= mask
.encoded_nelts ();
5014 mask
.quick_grow (count
);
5015 vec_perm_indices indices
;
5017 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5019 unsigned int iter_num
= j
/ group_size
;
5020 unsigned int stmt_num
= j
% group_size
;
5021 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5022 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5023 bitmap_set_bit (used_in_lanes
, i
);
5026 first_vec_index
= 0;
5031 /* Enforced before the loop when !repeating_p. */
5032 unsigned int const_nunits
= nunits
.to_constant ();
5033 vec_index
= i
/ const_nunits
;
5034 mask_element
= i
% const_nunits
;
5035 if (vec_index
== first_vec_index
5036 || first_vec_index
== -1)
5038 first_vec_index
= vec_index
;
5040 else if (vec_index
== second_vec_index
5041 || second_vec_index
== -1)
5043 second_vec_index
= vec_index
;
5044 mask_element
+= const_nunits
;
5048 if (dump_enabled_p ())
5049 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5050 "permutation requires at "
5051 "least three vectors %G",
5053 gcc_assert (analyze_only
);
5057 gcc_assert (mask_element
< 2 * const_nunits
);
5060 if (mask_element
!= index
)
5062 mask
[index
++] = mask_element
;
5064 if (index
== count
&& !noop_p
)
5066 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5067 if (!can_vec_perm_const_p (mode
, indices
))
5069 if (dump_enabled_p ())
5071 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5073 "unsupported vect permute { ");
5074 for (i
= 0; i
< count
; ++i
)
5076 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5077 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5079 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5081 gcc_assert (analyze_only
);
5092 tree mask_vec
= NULL_TREE
;
5095 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5097 if (second_vec_index
== -1)
5098 second_vec_index
= first_vec_index
;
5100 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5102 /* Generate the permute statement if necessary. */
5103 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5104 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5108 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5110 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5112 perm_dest
= make_ssa_name (perm_dest
);
5114 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5115 first_vec
, second_vec
,
5117 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5121 /* If mask was NULL_TREE generate the requested
5122 identity transform. */
5123 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
5125 /* Store the vector statement in NODE. */
5126 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5131 first_vec_index
= -1;
5132 second_vec_index
= -1;
5140 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5143 /* Enforced above when !repeating_p. */
5144 unsigned int const_nunits
= nunits
.to_constant ();
5146 bool load_seen
= false;
5147 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5149 if (i
% const_nunits
== 0)
5155 if (bitmap_bit_p (used_in_lanes
, i
))
5167 /* Vectorize the SLP permutations in NODE as specified
5168 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5169 child number and lane number.
5170 Interleaving of two two-lane two-child SLP subtrees (not supported):
5171 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5172 A blend of two four-lane two-child SLP subtrees:
5173 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5174 Highpart of a four-lane one-child SLP subtree (not supported):
5175 [ { 0, 2 }, { 0, 3 } ]
5176 Where currently only a subset is supported by code generating below. */
5179 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5180 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5182 tree vectype
= SLP_TREE_VECTYPE (node
);
5184 /* ??? We currently only support all same vector input and output types
5185 while the SLP IL should really do a concat + select and thus accept
5186 arbitrary mismatches. */
5189 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5190 if (!types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5192 if (dump_enabled_p ())
5193 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5194 "Unsupported lane permutation\n");
5198 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5199 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5200 if (dump_enabled_p ())
5202 dump_printf_loc (MSG_NOTE
, vect_location
,
5203 "vectorizing permutation");
5204 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5205 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5206 dump_printf (MSG_NOTE
, "\n");
5209 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5210 if (!nunits
.is_constant ())
5212 unsigned HOST_WIDE_INT vf
= 1;
5213 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5214 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
5216 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5217 gcc_assert (multiple_p (olanes
, nunits
));
5219 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5220 from the { SLP operand, scalar lane } permutation as recorded in the
5221 SLP node as intermediate step. This part should already work
5222 with SLP children with arbitrary number of lanes. */
5223 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5224 auto_vec
<unsigned> active_lane
;
5225 vperm
.create (olanes
);
5226 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5227 for (unsigned i
= 0; i
< vf
; ++i
)
5229 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5231 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5232 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5233 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
5234 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5235 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5236 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5238 /* Advance to the next group. */
5239 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5240 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5243 if (dump_enabled_p ())
5245 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5246 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5248 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5249 dump_printf (MSG_NOTE
, ",");
5250 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5251 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
5252 vperm
[i
].first
.second
);
5254 dump_printf (MSG_NOTE
, "\n");
5257 /* We can only handle two-vector permutes, everything else should
5258 be lowered on the SLP level. The following is closely inspired
5259 by vect_transform_slp_perm_load and is supposed to eventually
5261 ??? As intermediate step do code-gen in the SLP tree representation
5263 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5264 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5265 unsigned int const_nunits
= nunits
.to_constant ();
5266 unsigned int index
= 0;
5267 unsigned int mask_element
;
5268 vec_perm_builder mask
;
5269 mask
.new_vector (const_nunits
, const_nunits
, 1);
5270 unsigned int count
= mask
.encoded_nelts ();
5271 mask
.quick_grow (count
);
5272 vec_perm_indices indices
;
5273 unsigned nperms
= 0;
5274 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5276 mask_element
= vperm
[i
].second
;
5277 if (first_vec
.first
== -1U
5278 || first_vec
== vperm
[i
].first
)
5279 first_vec
= vperm
[i
].first
;
5280 else if (second_vec
.first
== -1U
5281 || second_vec
== vperm
[i
].first
)
5283 second_vec
= vperm
[i
].first
;
5284 mask_element
+= const_nunits
;
5288 if (dump_enabled_p ())
5289 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5290 "permutation requires at "
5291 "least three vectors\n");
5296 mask
[index
++] = mask_element
;
5300 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5302 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5304 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5306 if (dump_enabled_p ())
5308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5310 "unsupported vect permute { ");
5311 for (i
= 0; i
< count
; ++i
)
5313 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5314 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5316 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5326 if (second_vec
.first
== -1U)
5327 second_vec
= first_vec
;
5329 /* Generate the permute statement if necessary. */
5330 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5332 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5334 tree perm_dest
= make_ssa_name (vectype
);
5337 slp_tree second_node
5338 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5340 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5341 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5342 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5343 first_def
, second_def
,
5347 /* We need a copy here in case the def was external. */
5348 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5349 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5350 /* Store the vector statement in NODE. */
5351 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
5355 first_vec
= std::make_pair (-1U, -1U);
5356 second_vec
= std::make_pair (-1U, -1U);
5361 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
5366 /* Vectorize SLP NODE. */
5369 vect_schedule_slp_node (vec_info
*vinfo
,
5370 slp_tree node
, slp_instance instance
)
5372 gimple_stmt_iterator si
;
5376 /* For existing vectors there's nothing to do. */
5377 if (SLP_TREE_VEC_DEFS (node
).exists ())
5380 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
5382 /* Vectorize externals and constants. */
5383 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
5384 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
5386 /* ??? vectorizable_shift can end up using a scalar operand which is
5387 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5388 node in this case. */
5389 if (!SLP_TREE_VECTYPE (node
))
5392 vect_create_constant_vectors (vinfo
, node
);
5396 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
5398 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
5399 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
5401 if (dump_enabled_p ())
5402 dump_printf_loc (MSG_NOTE
, vect_location
,
5403 "------>vectorizing SLP node starting from: %G",
5406 if (STMT_VINFO_DATA_REF (stmt_info
)
5407 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5409 /* Vectorized loads go before the first scalar load to make it
5410 ready early, vectorized stores go before the last scalar
5411 stmt which is where all uses are ready. */
5412 stmt_vec_info last_stmt_info
= NULL
;
5413 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
5414 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
5415 else /* DR_IS_WRITE */
5416 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
5417 si
= gsi_for_stmt (last_stmt_info
->stmt
);
5419 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
5420 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
5421 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
5422 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5424 /* For PHI node vectorization we do not use the insertion iterator. */
5429 /* Emit other stmts after the children vectorized defs which is
5430 earliest possible. */
5431 gimple
*last_stmt
= NULL
;
5432 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5433 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
5435 /* For fold-left reductions we are retaining the scalar
5436 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5437 set so the representation isn't perfect. Resort to the
5438 last scalar def here. */
5439 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
5441 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
5442 == cycle_phi_info_type
);
5443 gphi
*phi
= as_a
<gphi
*>
5444 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
5446 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
5449 /* We are emitting all vectorized stmts in the same place and
5450 the last one is the last.
5451 ??? Unless we have a load permutation applied and that
5452 figures to re-use an earlier generated load. */
5455 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
5457 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5460 else if (!SLP_TREE_VECTYPE (child
))
5462 /* For externals we use unvectorized at all scalar defs. */
5465 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
5466 if (TREE_CODE (def
) == SSA_NAME
5467 && !SSA_NAME_IS_DEFAULT_DEF (def
))
5469 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
5471 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
5477 /* For externals we have to look at all defs since their
5478 insertion place is decided per vector. But beware
5479 of pre-existing vectors where we need to make sure
5480 we do not insert before the region boundary. */
5481 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
5482 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
5483 last_stmt
= gsi_stmt (gsi_after_labels
5484 (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]));
5489 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
5490 if (TREE_CODE (vdef
) == SSA_NAME
5491 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
5493 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
5495 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5500 /* This can happen when all children are pre-existing vectors or
5503 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
5504 if (is_a
<gphi
*> (last_stmt
))
5505 si
= gsi_after_labels (gimple_bb (last_stmt
));
5508 si
= gsi_for_stmt (last_stmt
);
5513 bool done_p
= false;
5515 /* Handle purely internal nodes. */
5516 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
5518 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
5519 be shared with different SLP nodes (but usually it's the same
5520 operation apart from the case the stmt is only there for denoting
5521 the actual scalar lane defs ...). So do not call vect_transform_stmt
5522 but open-code it here (partly). */
5523 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
5528 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
5531 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5532 For loop vectorization this is done in vectorizable_call, but for SLP
5533 it needs to be deferred until end of vect_schedule_slp, because multiple
5534 SLP instances may refer to the same scalar stmt. */
5537 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
5538 slp_tree node
, hash_set
<slp_tree
> &visited
)
5541 gimple_stmt_iterator gsi
;
5545 stmt_vec_info stmt_info
;
5547 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5550 if (visited
.add (node
))
5553 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5554 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
5556 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
5558 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
5559 if (!stmt
|| gimple_bb (stmt
) == NULL
)
5561 if (is_pattern_stmt_p (stmt_info
)
5562 || !PURE_SLP_STMT (stmt_info
))
5564 lhs
= gimple_call_lhs (stmt
);
5565 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
5566 gsi
= gsi_for_stmt (stmt
);
5567 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
5568 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
5573 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
5575 hash_set
<slp_tree
> visited
;
5576 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
5579 /* Vectorize the instance root. */
5582 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
5584 gassign
*rstmt
= NULL
;
5586 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
5591 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5593 tree vect_lhs
= gimple_get_lhs (child_stmt
);
5594 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5595 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
5596 TREE_TYPE (vect_lhs
)))
5597 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
5599 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
5603 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
5605 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5608 vec
<constructor_elt
, va_gc
> *v
;
5609 vec_alloc (v
, nelts
);
5611 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5613 CONSTRUCTOR_APPEND_ELT (v
,
5615 gimple_get_lhs (child_stmt
));
5617 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5618 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
5619 tree r_constructor
= build_constructor (rtype
, v
);
5620 rstmt
= gimple_build_assign (lhs
, r_constructor
);
5625 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
5626 gsi_replace (&rgsi
, rstmt
, true);
5636 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
5639 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
5640 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
5641 int &maxdfs
, vec
<slp_tree
> &stack
)
5644 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
5645 gcc_assert (!existed_p
);
5647 info
->lowlink
= maxdfs
;
5651 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5653 info
->on_stack
= false;
5654 vect_schedule_slp_node (vinfo
, node
, instance
);
5658 info
->on_stack
= true;
5659 stack
.safe_push (node
);
5664 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5668 slp_scc_info
*child_info
= scc_info
.get (child
);
5671 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
5672 /* Recursion might have re-allocated the node. */
5673 info
= scc_info
.get (node
);
5674 child_info
= scc_info
.get (child
);
5675 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
5677 else if (child_info
->on_stack
)
5678 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
5680 if (info
->lowlink
!= info
->dfs
)
5683 auto_vec
<slp_tree
, 4> phis_to_fixup
;
5686 if (stack
.last () == node
)
5689 info
->on_stack
= false;
5690 vect_schedule_slp_node (vinfo
, node
, instance
);
5691 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
5692 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
5693 phis_to_fixup
.quick_push (node
);
5698 int last_idx
= stack
.length () - 1;
5699 while (stack
[last_idx
] != node
)
5701 /* We can break the cycle at PHIs who have at least one child
5702 code generated. Then we could re-start the DFS walk until
5703 all nodes in the SCC are covered (we might have new entries
5704 for only back-reachable nodes). But it's simpler to just
5705 iterate and schedule those that are ready. */
5706 unsigned todo
= stack
.length () - last_idx
;
5709 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
5711 slp_tree entry
= stack
[idx
];
5714 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
5715 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
5717 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
5724 else if (scc_info
.get (child
)->on_stack
)
5742 vect_schedule_slp_node (vinfo
, entry
, instance
);
5743 scc_info
.get (entry
)->on_stack
= false;
5747 phis_to_fixup
.safe_push (entry
);
5754 stack
.truncate (last_idx
);
5757 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
5759 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
5761 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
5764 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
5766 unsigned dest_idx
= e
->dest_idx
;
5767 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
5768 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
5770 /* Simply fill all args. */
5771 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
5772 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
5773 vect_get_slp_vect_def (child
, i
),
5774 e
, gimple_phi_arg_location (phi
, dest_idx
));
5779 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5782 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
5784 slp_instance instance
;
5787 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
5789 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5791 slp_tree node
= SLP_INSTANCE_TREE (instance
);
5792 if (dump_enabled_p ())
5794 dump_printf_loc (MSG_NOTE
, vect_location
,
5795 "Vectorizing SLP tree:\n");
5796 if (SLP_INSTANCE_ROOT_STMT (instance
))
5797 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
5798 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
5799 vect_print_slp_graph (MSG_NOTE
, vect_location
,
5800 SLP_INSTANCE_TREE (instance
));
5802 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
5803 have a PHI be the node breaking the cycle. */
5804 auto_vec
<slp_tree
> stack
;
5805 if (!scc_info
.get (node
))
5806 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
5808 if (SLP_INSTANCE_ROOT_STMT (instance
))
5809 vectorize_slp_instance_root_stmt (node
, instance
);
5811 if (dump_enabled_p ())
5812 dump_printf_loc (MSG_NOTE
, vect_location
,
5813 "vectorizing stmts using SLP.\n");
5816 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5818 slp_tree root
= SLP_INSTANCE_TREE (instance
);
5819 stmt_vec_info store_info
;
5822 /* Remove scalar call stmts. Do not do this for basic-block
5823 vectorization as not all uses may be vectorized.
5824 ??? Why should this be necessary? DCE should be able to
5825 remove the stmts itself.
5826 ??? For BB vectorization we can as well remove scalar
5827 stmts starting from the SLP tree root if they have no
5829 if (is_a
<loop_vec_info
> (vinfo
))
5830 vect_remove_slp_scalar_calls (vinfo
, root
);
5832 /* Remove vectorized stores original scalar stmts. */
5833 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
5835 if (!STMT_VINFO_DATA_REF (store_info
)
5836 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
5839 store_info
= vect_orig_stmt (store_info
);
5840 /* Free the attached stmt_vec_info and remove the stmt. */
5841 vinfo
->remove_stmt (store_info
);