1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
54 slp_tree
, stmt_vector_for_cost
*);
56 static object_allocator
<_slp_tree
> *slp_tree_pool
;
57 static slp_tree slp_first_node
;
62 slp_tree_pool
= new object_allocator
<_slp_tree
> ("SLP nodes");
68 while (slp_first_node
)
69 delete slp_first_node
;
75 _slp_tree::operator new (size_t n
)
77 gcc_assert (n
== sizeof (_slp_tree
));
78 return slp_tree_pool
->allocate_raw ();
82 _slp_tree::operator delete (void *node
, size_t n
)
84 gcc_assert (n
== sizeof (_slp_tree
));
85 slp_tree_pool
->remove_raw (node
);
89 /* Initialize a SLP node. */
91 _slp_tree::_slp_tree ()
93 this->prev_node
= NULL
;
95 slp_first_node
->prev_node
= this;
96 this->next_node
= slp_first_node
;
97 slp_first_node
= this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
99 SLP_TREE_SCALAR_OPS (this) = vNULL
;
100 SLP_TREE_VEC_STMTS (this) = vNULL
;
101 SLP_TREE_VEC_DEFS (this) = vNULL
;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL
;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
107 SLP_TREE_CODE (this) = ERROR_MARK
;
108 SLP_TREE_VECTYPE (this) = NULL_TREE
;
109 SLP_TREE_REPRESENTATIVE (this) = NULL
;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits
= 1;
115 /* Tear down a SLP node. */
117 _slp_tree::~_slp_tree ()
120 this->prev_node
->next_node
= this->next_node
;
122 slp_first_node
= this->next_node
;
124 this->next_node
->prev_node
= this->prev_node
;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
137 vect_free_slp_tree (slp_tree node
)
142 if (--SLP_TREE_REF_COUNT (node
) != 0)
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
147 vect_free_slp_tree (child
);
152 /* Return a location suitable for dumpings related to the SLP instance. */
155 _slp_instance::location () const
158 return root_stmt
->stmt
;
160 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
164 /* Free the memory allocated for the SLP instance. */
167 vect_free_slp_instance (slp_instance instance
)
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
170 SLP_INSTANCE_LOADS (instance
).release ();
171 instance
->subgraph_entries
.release ();
172 instance
->cost_vec
.release ();
177 /* Create an SLP node for SCALAR_STMTS. */
180 vect_create_new_slp_node (slp_tree node
,
181 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
183 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
184 SLP_TREE_CHILDREN (node
).create (nops
);
185 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
186 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
187 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
191 /* Create an SLP node for SCALAR_STMTS. */
194 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
196 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
199 /* Create an SLP node for OPS. */
202 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
204 SLP_TREE_SCALAR_OPS (node
) = ops
;
205 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
206 SLP_TREE_LANES (node
) = ops
.length ();
210 /* Create an SLP node for OPS. */
213 vect_create_new_slp_node (vec
<tree
> ops
)
215 return vect_create_new_slp_node (new _slp_tree
, ops
);
219 /* This structure is used in creation of an SLP tree. Each instance
220 corresponds to the same operand in a group of scalar stmts in an SLP
222 typedef struct _slp_oprnd_info
224 /* Def-stmts for the operands. */
225 vec
<stmt_vec_info
> def_stmts
;
228 /* Information about the first statement, its vector def-type, type, the
229 operand itself in case it's constant, and an indication if it's a pattern
232 enum vect_def_type first_dt
;
237 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
239 static vec
<slp_oprnd_info
>
240 vect_create_oprnd_info (int nops
, int group_size
)
243 slp_oprnd_info oprnd_info
;
244 vec
<slp_oprnd_info
> oprnds_info
;
246 oprnds_info
.create (nops
);
247 for (i
= 0; i
< nops
; i
++)
249 oprnd_info
= XNEW (struct _slp_oprnd_info
);
250 oprnd_info
->def_stmts
.create (group_size
);
251 oprnd_info
->ops
.create (group_size
);
252 oprnd_info
->first_dt
= vect_uninitialized_def
;
253 oprnd_info
->first_op_type
= NULL_TREE
;
254 oprnd_info
->any_pattern
= false;
255 oprnds_info
.quick_push (oprnd_info
);
262 /* Free operands info. */
265 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
268 slp_oprnd_info oprnd_info
;
270 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
272 oprnd_info
->def_stmts
.release ();
273 oprnd_info
->ops
.release ();
274 XDELETE (oprnd_info
);
277 oprnds_info
.release ();
281 /* Return true if STMTS contains a pattern statement. */
284 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
286 stmt_vec_info stmt_info
;
288 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
289 if (is_pattern_stmt_p (stmt_info
))
294 /* Return true when all lanes in the external or constant NODE have
298 vect_slp_tree_uniform_p (slp_tree node
)
300 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
301 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
303 /* Pre-exsting vectors. */
304 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
308 tree op
, first
= NULL_TREE
;
309 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
312 else if (!operand_equal_p (first
, op
, 0))
318 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
319 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
323 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
324 stmt_vec_info first_stmt_info
)
326 stmt_vec_info next_stmt_info
= first_stmt_info
;
329 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
334 if (next_stmt_info
== stmt_info
)
336 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
338 result
+= DR_GROUP_GAP (next_stmt_info
);
340 while (next_stmt_info
);
345 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
346 using the method implemented by duplicate_and_interleave. Return true
347 if so, returning the number of intermediate vectors in *NVECTORS_OUT
348 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
352 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
353 tree elt_type
, unsigned int *nvectors_out
,
354 tree
*vector_type_out
,
357 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
358 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
361 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
362 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
363 unsigned int nvectors
= 1;
366 scalar_int_mode int_mode
;
367 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
368 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
370 /* Get the natural vector type for this SLP group size. */
371 tree int_type
= build_nonstandard_integer_type
372 (GET_MODE_BITSIZE (int_mode
), 1);
374 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
376 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
377 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
378 GET_MODE_SIZE (base_vector_mode
)))
380 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
381 together into elements of type INT_TYPE and using the result
382 to build NVECTORS vectors. */
383 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
384 vec_perm_builder
sel1 (nelts
, 2, 3);
385 vec_perm_builder
sel2 (nelts
, 2, 3);
386 poly_int64 half_nelts
= exact_div (nelts
, 2);
387 for (unsigned int i
= 0; i
< 3; ++i
)
390 sel1
.quick_push (i
+ nelts
);
391 sel2
.quick_push (half_nelts
+ i
);
392 sel2
.quick_push (half_nelts
+ i
+ nelts
);
394 vec_perm_indices
indices1 (sel1
, 2, nelts
);
395 vec_perm_indices
indices2 (sel2
, 2, nelts
);
396 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
397 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
400 *nvectors_out
= nvectors
;
402 *vector_type_out
= vector_type
;
405 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
407 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
414 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
420 /* Return true if DTA and DTB match. */
423 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
426 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
427 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
430 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
431 they are of a valid type and that they match the defs of the first stmt of
432 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
433 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
434 indicates swap is required for cond_expr stmts. Specifically, *SWAP
435 is 1 if STMT is cond and operands of comparison need to be swapped;
436 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
437 If there is any operand swap in this function, *SWAP is set to non-zero
439 If there was a fatal error return -1; if the error could be corrected by
440 swapping operands of father node of this one, return 1; if everything is
443 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
445 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
446 vec
<slp_oprnd_info
> *oprnds_info
)
448 stmt_vec_info stmt_info
= stmts
[stmt_num
];
450 unsigned int i
, number_of_oprnds
;
451 enum vect_def_type dt
= vect_uninitialized_def
;
452 slp_oprnd_info oprnd_info
;
453 int first_op_idx
= 1;
454 unsigned int commutative_op
= -1U;
455 bool first_op_cond
= false;
456 bool first
= stmt_num
== 0;
458 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
460 number_of_oprnds
= gimple_call_num_args (stmt
);
462 if (gimple_call_internal_p (stmt
))
464 internal_fn ifn
= gimple_call_internal_fn (stmt
);
465 commutative_op
= first_commutative_argument (ifn
);
467 /* Masked load, only look at mask. */
468 if (ifn
== IFN_MASK_LOAD
)
470 number_of_oprnds
= 1;
471 /* Mask operand index. */
476 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
478 enum tree_code code
= gimple_assign_rhs_code (stmt
);
479 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
480 /* Swap can only be done for cond_expr if asked to, otherwise we
481 could result in different comparison code to the first stmt. */
482 if (code
== COND_EXPR
483 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
485 first_op_cond
= true;
489 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
491 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
492 number_of_oprnds
= gimple_phi_num_args (stmt
);
496 bool swapped
= (swap
!= 0);
497 bool backedge
= false;
498 gcc_assert (!swapped
|| first_op_cond
);
499 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
500 for (i
= 0; i
< number_of_oprnds
; i
++)
504 /* Map indicating how operands of cond_expr should be swapped. */
505 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
506 int *map
= maps
[swap
];
509 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
510 first_op_idx
), map
[i
]);
512 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
514 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
516 oprnd
= gimple_phi_arg_def (stmt
, i
);
517 backedge
= dominated_by_p (CDI_DOMINATORS
,
518 gimple_phi_arg_edge (stmt
, i
)->src
,
519 gimple_bb (stmt_info
->stmt
));
522 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
523 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
524 oprnd
= TREE_OPERAND (oprnd
, 0);
526 oprnd_info
= (*oprnds_info
)[i
];
528 stmt_vec_info def_stmt_info
;
529 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
531 if (dump_enabled_p ())
532 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
533 "Build SLP failed: can't analyze def for %T\n",
541 oprnd_info
->def_stmts
.quick_push (NULL
);
542 oprnd_info
->ops
.quick_push (NULL_TREE
);
543 oprnd_info
->first_dt
= vect_uninitialized_def
;
547 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
548 oprnd_info
->ops
.quick_push (oprnd
);
551 && is_pattern_stmt_p (def_stmt_info
))
553 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info
))
555 oprnd_info
->any_pattern
= true;
557 /* If we promote this to external use the original stmt def. */
558 oprnd_info
->ops
.last ()
559 = gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
562 /* If there's a extern def on a backedge make sure we can
563 code-generate at the region start.
564 ??? This is another case that could be fixed by adjusting
565 how we split the function but at the moment we'd have conflicting
568 && dts
[i
] == vect_external_def
569 && is_a
<bb_vec_info
> (vinfo
)
570 && TREE_CODE (oprnd
) == SSA_NAME
571 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
572 && !dominated_by_p (CDI_DOMINATORS
,
573 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
574 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
578 "Build SLP failed: extern def %T only defined "
579 "on backedge\n", oprnd
);
585 tree type
= TREE_TYPE (oprnd
);
587 if ((dt
== vect_constant_def
588 || dt
== vect_external_def
)
589 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
590 && (TREE_CODE (type
) == BOOLEAN_TYPE
591 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
594 if (dump_enabled_p ())
595 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
596 "Build SLP failed: invalid type of def "
597 "for variable-length SLP %T\n", oprnd
);
601 /* For the swapping logic below force vect_reduction_def
602 for the reduction op in a SLP reduction group. */
603 if (!STMT_VINFO_DATA_REF (stmt_info
)
604 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
605 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
607 dts
[i
] = dt
= vect_reduction_def
;
609 /* Check the types of the definition. */
612 case vect_external_def
:
613 case vect_constant_def
:
614 case vect_internal_def
:
615 case vect_reduction_def
:
616 case vect_induction_def
:
617 case vect_nested_cycle
:
621 /* FORNOW: Not supported. */
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
624 "Build SLP failed: illegal type of def %T\n",
629 oprnd_info
->first_dt
= dt
;
630 oprnd_info
->first_op_type
= type
;
636 /* Now match the operand definition types to that of the first stmt. */
637 for (i
= 0; i
< number_of_oprnds
;)
645 oprnd_info
= (*oprnds_info
)[i
];
647 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
648 oprnd
= oprnd_info
->ops
[stmt_num
];
649 tree type
= TREE_TYPE (oprnd
);
651 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
655 "Build SLP failed: different operand types\n");
659 /* Not first stmt of the group, check that the def-stmt/s match
660 the def-stmt/s of the first stmt. Allow different definition
661 types for reduction chains: the first stmt must be a
662 vect_reduction_def (a phi node), and the rest
663 end in the reduction chain. */
664 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
665 && !(oprnd_info
->first_dt
== vect_reduction_def
666 && !STMT_VINFO_DATA_REF (stmt_info
)
667 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
669 && !STMT_VINFO_DATA_REF (def_stmt_info
)
670 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
671 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
672 || (!STMT_VINFO_DATA_REF (stmt_info
)
673 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
675 || STMT_VINFO_DATA_REF (def_stmt_info
)
676 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
677 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
678 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
680 /* Try swapping operands if we got a mismatch. For BB
681 vectorization only in case it will clearly improve things. */
682 if (i
== commutative_op
&& !swapped
683 && (!is_a
<bb_vec_info
> (vinfo
)
684 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
686 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
687 || vect_def_types_match
688 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
690 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE
, vect_location
,
692 "trying swapped operands\n");
693 std::swap (dts
[i
], dts
[i
+1]);
694 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
695 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
696 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
697 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
702 if (is_a
<bb_vec_info
> (vinfo
)
703 && !oprnd_info
->any_pattern
)
705 /* Now for commutative ops we should see whether we can
706 make the other operand matching. */
707 if (dump_enabled_p ())
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
709 "treating operand as external\n");
710 oprnd_info
->first_dt
= dt
= vect_external_def
;
714 if (dump_enabled_p ())
715 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
716 "Build SLP failed: different types\n");
721 /* Make sure to demote the overall operand to external. */
722 if (dt
== vect_external_def
)
723 oprnd_info
->first_dt
= vect_external_def
;
724 /* For a SLP reduction chain we want to duplicate the reduction to
725 each of the chain members. That gets us a sane SLP graph (still
726 the stmts are not 100% correct wrt the initial values). */
727 else if ((dt
== vect_internal_def
728 || dt
== vect_reduction_def
)
729 && oprnd_info
->first_dt
== vect_reduction_def
730 && !STMT_VINFO_DATA_REF (stmt_info
)
731 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
732 && !STMT_VINFO_DATA_REF (def_stmt_info
)
733 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
734 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
736 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
737 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
746 if (dump_enabled_p ())
747 dump_printf_loc (MSG_NOTE
, vect_location
,
748 "swapped operands to match def types in %G",
755 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
756 Return true if we can, meaning that this choice doesn't conflict with
757 existing SLP nodes that use STMT_INFO. */
760 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
762 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
764 return useless_type_conversion_p (vectype
, old_vectype
);
766 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
768 /* We maintain the invariant that if any statement in the group is
769 used, all other members of the group have the same vector type. */
770 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
771 stmt_vec_info member_info
= first_info
;
772 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
773 if (is_pattern_stmt_p (member_info
)
774 && !useless_type_conversion_p (vectype
,
775 STMT_VINFO_VECTYPE (member_info
)))
780 for (member_info
= first_info
; member_info
;
781 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
782 STMT_VINFO_VECTYPE (member_info
) = vectype
;
786 else if (!is_pattern_stmt_p (stmt_info
))
788 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
792 if (dump_enabled_p ())
794 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
795 "Build SLP failed: incompatible vector"
796 " types for: %G", stmt_info
->stmt
);
797 dump_printf_loc (MSG_NOTE
, vect_location
,
798 " old vector type: %T\n", old_vectype
);
799 dump_printf_loc (MSG_NOTE
, vect_location
,
800 " new vector type: %T\n", vectype
);
805 /* Return true if call statements CALL1 and CALL2 are similar enough
806 to be combined into the same SLP group. */
809 compatible_calls_p (gcall
*call1
, gcall
*call2
)
811 unsigned int nargs
= gimple_call_num_args (call1
);
812 if (nargs
!= gimple_call_num_args (call2
))
815 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
818 if (gimple_call_internal_p (call1
))
820 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
821 TREE_TYPE (gimple_call_lhs (call2
))))
823 for (unsigned int i
= 0; i
< nargs
; ++i
)
824 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
825 TREE_TYPE (gimple_call_arg (call2
, i
))))
830 if (!operand_equal_p (gimple_call_fn (call1
),
831 gimple_call_fn (call2
), 0))
834 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
840 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
841 caller's attempt to find the vector type in STMT_INFO with the narrowest
842 element type. Return true if VECTYPE is nonnull and if it is valid
843 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
844 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
845 vect_build_slp_tree. */
848 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
849 unsigned int group_size
,
850 tree vectype
, poly_uint64
*max_nunits
)
854 if (dump_enabled_p ())
855 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
856 "Build SLP failed: unsupported data-type in %G\n",
858 /* Fatal mismatch. */
862 /* If populating the vector type requires unrolling then fail
863 before adjusting *max_nunits for basic-block vectorization. */
864 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
865 unsigned HOST_WIDE_INT const_nunits
;
866 if (is_a
<bb_vec_info
> (vinfo
)
867 && (!nunits
.is_constant (&const_nunits
)
868 || const_nunits
> group_size
))
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
872 "Build SLP failed: unrolling required "
873 "in basic block SLP\n");
874 /* Fatal mismatch. */
878 /* In case of multiple types we need to detect the smallest type. */
879 vect_update_max_nunits (max_nunits
, vectype
);
883 /* Verify if the scalar stmts STMTS are isomorphic, require data
884 permutation or are of unsupported types of operation. Return
885 true if they are, otherwise return false and indicate in *MATCHES
886 which stmts are not isomorphic to the first one. If MATCHES[0]
887 is false then this indicates the comparison could not be
888 carried out or the stmts will never be vectorized by SLP.
890 Note COND_EXPR is possibly isomorphic to another one after swapping its
891 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
892 the first stmt by swapping the two operands of comparison; set SWAP[i]
893 to 2 if stmt I is isomorphic to the first stmt by inverting the code
894 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
895 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
898 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
899 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
900 poly_uint64
*max_nunits
, bool *matches
,
901 bool *two_operators
, tree
*node_vectype
)
904 stmt_vec_info first_stmt_info
= stmts
[0];
905 enum tree_code first_stmt_code
= ERROR_MARK
;
906 enum tree_code alt_stmt_code
= ERROR_MARK
;
907 enum tree_code rhs_code
= ERROR_MARK
;
908 enum tree_code first_cond_code
= ERROR_MARK
;
910 bool need_same_oprnds
= false;
911 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
914 machine_mode optab_op2_mode
;
915 machine_mode vec_mode
;
916 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
917 bool first_stmt_load_p
= false, load_p
= false;
918 bool first_stmt_phi_p
= false, phi_p
= false;
920 /* For every stmt in NODE find its def stmt/s. */
921 stmt_vec_info stmt_info
;
922 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
924 gimple
*stmt
= stmt_info
->stmt
;
928 if (dump_enabled_p ())
929 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
931 /* Fail to vectorize statements marked as unvectorizable, throw
933 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
934 || stmt_can_throw_internal (cfun
, stmt
)
935 || gimple_has_volatile_ops (stmt
))
937 if (dump_enabled_p ())
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
939 "Build SLP failed: unvectorizable statement %G",
941 /* ??? For BB vectorization we want to commutate operands in a way
942 to shuffle all unvectorizable defs into one operand and have
943 the other still vectorized. The following doesn't reliably
944 work for this though but it's the easiest we can do here. */
945 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
947 /* Fatal mismatch. */
952 lhs
= gimple_get_lhs (stmt
);
953 if (lhs
== NULL_TREE
)
955 if (dump_enabled_p ())
956 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
957 "Build SLP failed: not GIMPLE_ASSIGN nor "
958 "GIMPLE_CALL %G", stmt
);
959 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
961 /* Fatal mismatch. */
967 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
968 &nunits_vectype
, group_size
)
970 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
971 nunits_vectype
, max_nunits
)))
973 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
975 /* Fatal mismatch. */
980 gcc_assert (vectype
);
982 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
985 rhs_code
= CALL_EXPR
;
987 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
989 else if ((gimple_call_internal_p (call_stmt
)
990 && (!vectorizable_internal_fn_p
991 (gimple_call_internal_fn (call_stmt
))))
992 || gimple_call_tail_p (call_stmt
)
993 || gimple_call_noreturn_p (call_stmt
)
994 || !gimple_call_nothrow_p (call_stmt
)
995 || gimple_call_chain (call_stmt
))
997 if (dump_enabled_p ())
998 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
999 "Build SLP failed: unsupported call type %G",
1001 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1003 /* Fatal mismatch. */
1008 else if (gimple_code (stmt
) == GIMPLE_PHI
)
1010 rhs_code
= ERROR_MARK
;
1015 rhs_code
= gimple_assign_rhs_code (stmt
);
1016 load_p
= gimple_vuse (stmt
);
1019 /* Check the operation. */
1022 *node_vectype
= vectype
;
1023 first_stmt_code
= rhs_code
;
1024 first_stmt_load_p
= load_p
;
1025 first_stmt_phi_p
= phi_p
;
1027 /* Shift arguments should be equal in all the packed stmts for a
1028 vector shift with scalar shift operand. */
1029 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
1030 || rhs_code
== LROTATE_EXPR
1031 || rhs_code
== RROTATE_EXPR
)
1033 vec_mode
= TYPE_MODE (vectype
);
1035 /* First see if we have a vector/vector shift. */
1036 optab
= optab_for_tree_code (rhs_code
, vectype
,
1040 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1042 /* No vector/vector shift, try for a vector/scalar shift. */
1043 optab
= optab_for_tree_code (rhs_code
, vectype
,
1048 if (dump_enabled_p ())
1049 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1050 "Build SLP failed: no optab.\n");
1051 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1053 /* Fatal mismatch. */
1057 icode
= (int) optab_handler (optab
, vec_mode
);
1058 if (icode
== CODE_FOR_nothing
)
1060 if (dump_enabled_p ())
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1062 "Build SLP failed: "
1063 "op not supported by target.\n");
1064 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1066 /* Fatal mismatch. */
1070 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1071 if (!VECTOR_MODE_P (optab_op2_mode
))
1073 need_same_oprnds
= true;
1074 first_op1
= gimple_assign_rhs2 (stmt
);
1078 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1080 need_same_oprnds
= true;
1081 first_op1
= gimple_assign_rhs2 (stmt
);
1084 && rhs_code
== BIT_FIELD_REF
)
1086 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1087 if (TREE_CODE (vec
) != SSA_NAME
1088 || !types_compatible_p (vectype
, TREE_TYPE (vec
)))
1090 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1094 "Build SLP failed: "
1095 "BIT_FIELD_REF not supported\n");
1096 /* Fatal mismatch. */
1102 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1104 need_same_oprnds
= true;
1105 first_op1
= gimple_call_arg (call_stmt
, 1);
1110 if (first_stmt_code
!= rhs_code
1111 && alt_stmt_code
== ERROR_MARK
)
1112 alt_stmt_code
= rhs_code
;
1113 if ((first_stmt_code
!= rhs_code
1114 && (first_stmt_code
!= IMAGPART_EXPR
1115 || rhs_code
!= REALPART_EXPR
)
1116 && (first_stmt_code
!= REALPART_EXPR
1117 || rhs_code
!= IMAGPART_EXPR
)
1118 /* Handle mismatches in plus/minus by computing both
1119 and merging the results. */
1120 && !((first_stmt_code
== PLUS_EXPR
1121 || first_stmt_code
== MINUS_EXPR
)
1122 && (alt_stmt_code
== PLUS_EXPR
1123 || alt_stmt_code
== MINUS_EXPR
)
1124 && rhs_code
== alt_stmt_code
)
1125 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1126 && (first_stmt_code
== ARRAY_REF
1127 || first_stmt_code
== BIT_FIELD_REF
1128 || first_stmt_code
== INDIRECT_REF
1129 || first_stmt_code
== COMPONENT_REF
1130 || first_stmt_code
== MEM_REF
)))
1131 || first_stmt_load_p
!= load_p
1132 || first_stmt_phi_p
!= phi_p
)
1134 if (dump_enabled_p ())
1136 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1137 "Build SLP failed: different operation "
1138 "in stmt %G", stmt
);
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1140 "original stmt %G", first_stmt_info
->stmt
);
1146 if (need_same_oprnds
)
1148 tree other_op1
= (call_stmt
1149 ? gimple_call_arg (call_stmt
, 1)
1150 : gimple_assign_rhs2 (stmt
));
1151 if (!operand_equal_p (first_op1
, other_op1
, 0))
1153 if (dump_enabled_p ())
1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1155 "Build SLP failed: different shift "
1156 "arguments in %G", stmt
);
1162 && first_stmt_code
== BIT_FIELD_REF
1163 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1164 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1166 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1168 "Build SLP failed: different BIT_FIELD_REF "
1169 "arguments in %G", stmt
);
1174 if (!load_p
&& rhs_code
== CALL_EXPR
)
1176 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1177 as_a
<gcall
*> (stmt
)))
1179 if (dump_enabled_p ())
1180 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1181 "Build SLP failed: different calls in %G",
1189 && (gimple_bb (first_stmt_info
->stmt
)
1190 != gimple_bb (stmt_info
->stmt
)))
1192 if (dump_enabled_p ())
1193 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1194 "Build SLP failed: different BB for PHI "
1200 if (!types_compatible_p (vectype
, *node_vectype
))
1202 if (dump_enabled_p ())
1203 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1204 "Build SLP failed: different vector type "
1211 /* Grouped store or load. */
1212 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1214 if (REFERENCE_CLASS_P (lhs
))
1222 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1223 if (prev_first_load
)
1225 /* Check that there are no loads from different interleaving
1226 chains in the same node. */
1227 if (prev_first_load
!= first_load
)
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1232 "Build SLP failed: different "
1233 "interleaving chains in one node %G",
1240 prev_first_load
= first_load
;
1242 } /* Grouped access. */
1247 /* Not grouped load. */
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1250 "Build SLP failed: not grouped load %G", stmt
);
1252 /* FORNOW: Not grouped loads are not supported. */
1253 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1255 /* Fatal mismatch. */
1260 /* Not memory operation. */
1262 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1263 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1264 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1265 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1266 && rhs_code
!= VIEW_CONVERT_EXPR
1267 && rhs_code
!= CALL_EXPR
1268 && rhs_code
!= BIT_FIELD_REF
)
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1272 "Build SLP failed: operation unsupported %G",
1274 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1276 /* Fatal mismatch. */
1281 if (rhs_code
== COND_EXPR
)
1283 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1284 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1285 enum tree_code swap_code
= ERROR_MARK
;
1286 enum tree_code invert_code
= ERROR_MARK
;
1289 first_cond_code
= TREE_CODE (cond_expr
);
1290 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1292 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1293 swap_code
= swap_tree_comparison (cond_code
);
1294 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1297 if (first_cond_code
== cond_code
)
1299 /* Isomorphic can be achieved by swapping. */
1300 else if (first_cond_code
== swap_code
)
1302 /* Isomorphic can be achieved by inverting. */
1303 else if (first_cond_code
== invert_code
)
1307 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1309 "Build SLP failed: different"
1310 " operation %G", stmt
);
1320 for (i
= 0; i
< group_size
; ++i
)
1324 /* If we allowed a two-operation SLP node verify the target can cope
1325 with the permute we are going to use. */
1326 if (alt_stmt_code
!= ERROR_MARK
1327 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1329 *two_operators
= true;
1335 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1336 Note we never remove apart from at destruction time so we do not
1337 need a special value for deleted that differs from empty. */
1340 typedef vec
<stmt_vec_info
> value_type
;
1341 typedef vec
<stmt_vec_info
> compare_type
;
1342 static inline hashval_t
hash (value_type
);
1343 static inline bool equal (value_type existing
, value_type candidate
);
1344 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1345 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1346 static const bool empty_zero_p
= true;
1347 static inline void mark_empty (value_type
&x
) { x
.release (); }
1348 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1349 static inline void remove (value_type
&x
) { x
.release (); }
1352 bst_traits::hash (value_type x
)
1355 for (unsigned i
= 0; i
< x
.length (); ++i
)
1356 h
.add_int (gimple_uid (x
[i
]->stmt
));
1360 bst_traits::equal (value_type existing
, value_type candidate
)
1362 if (existing
.length () != candidate
.length ())
1364 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1365 if (existing
[i
] != candidate
[i
])
1370 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1371 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1372 scalar_stmts_to_slp_tree_map_t
;
1375 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1376 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1377 poly_uint64
*max_nunits
,
1378 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1379 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1382 vect_build_slp_tree (vec_info
*vinfo
,
1383 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1384 poly_uint64
*max_nunits
,
1385 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1386 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1388 if (slp_tree
*leader
= bst_map
->get (stmts
))
1390 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1392 *leader
? "" : "failed ", *leader
);
1395 SLP_TREE_REF_COUNT (*leader
)++;
1396 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1401 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1402 so we can pick up backedge destinations during discovery. */
1403 slp_tree res
= new _slp_tree
;
1404 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1405 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1406 bst_map
->put (stmts
.copy (), res
);
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_NOTE
, vect_location
,
1412 "SLP discovery limit exceeded\n");
1413 bool existed_p
= bst_map
->put (stmts
, NULL
);
1414 gcc_assert (existed_p
);
1415 /* Mark the node invalid so we can detect those when still in use
1416 as backedge destinations. */
1417 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1418 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1419 vect_free_slp_tree (res
);
1424 poly_uint64 this_max_nunits
= 1;
1425 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1427 matches
, limit
, tree_size
, bst_map
);
1430 bool existed_p
= bst_map
->put (stmts
, NULL
);
1431 gcc_assert (existed_p
);
1432 /* Mark the node invalid so we can detect those when still in use
1433 as backedge destinations. */
1434 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1435 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1436 vect_free_slp_tree (res
);
1440 gcc_assert (res_
== res
);
1441 res
->max_nunits
= this_max_nunits
;
1442 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1443 /* Keep a reference for the bst_map use. */
1444 SLP_TREE_REF_COUNT (res
)++;
1449 /* Recursively build an SLP tree starting from NODE.
1450 Fail (and return a value not equal to zero) if def-stmts are not
1451 isomorphic, require data permutation or are of unsupported types of
1452 operation. Otherwise, return 0.
1453 The value returned is the depth in the SLP tree where a mismatch
1457 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1458 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1459 poly_uint64
*max_nunits
,
1460 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1461 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1463 unsigned nops
, i
, this_tree_size
= 0;
1464 poly_uint64 this_max_nunits
= *max_nunits
;
1468 stmt_vec_info stmt_info
= stmts
[0];
1469 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1470 nops
= gimple_call_num_args (stmt
);
1471 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1473 nops
= gimple_num_ops (stmt
) - 1;
1474 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1477 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1478 nops
= gimple_phi_num_args (phi
);
1482 /* If the SLP node is a PHI (induction or reduction), terminate
1484 bool *skip_args
= XALLOCAVEC (bool, nops
);
1485 memset (skip_args
, 0, sizeof (bool) * nops
);
1486 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1487 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1489 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1490 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1492 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1496 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1497 if (def_type
== vect_induction_def
)
1499 /* Induction PHIs are not cycles but walk the initial
1500 value. Only for inner loops through, for outer loops
1501 we need to pick up the value from the actual PHIs
1502 to more easily support peeling and epilogue vectorization. */
1503 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1504 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1505 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1508 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1510 else if (def_type
== vect_reduction_def
1511 || def_type
== vect_double_reduction_def
1512 || def_type
== vect_nested_cycle
)
1514 /* Else def types have to match. */
1515 stmt_vec_info other_info
;
1516 bool all_same
= true;
1517 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1519 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1521 if (other_info
!= stmt_info
)
1524 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1525 /* Reduction initial values are not explicitely represented. */
1526 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1527 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1528 /* Reduction chain backedge defs are filled manually.
1529 ??? Need a better way to identify a SLP reduction chain PHI.
1530 Or a better overall way to SLP match those. */
1531 if (all_same
&& def_type
== vect_reduction_def
)
1532 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1534 else if (def_type
!= vect_internal_def
)
1539 bool two_operators
= false;
1540 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1541 tree vectype
= NULL_TREE
;
1542 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1543 &this_max_nunits
, matches
, &two_operators
,
1547 /* If the SLP node is a load, terminate the recursion unless masked. */
1548 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1549 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1551 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1554 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1559 *max_nunits
= this_max_nunits
;
1561 node
= vect_create_new_slp_node (node
, stmts
, 0);
1562 SLP_TREE_VECTYPE (node
) = vectype
;
1563 /* And compute the load permutation. Whether it is actually
1564 a permutation depends on the unrolling factor which is
1566 vec
<unsigned> load_permutation
;
1568 stmt_vec_info load_info
;
1569 load_permutation
.create (group_size
);
1570 stmt_vec_info first_stmt_info
1571 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1572 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1574 int load_place
= vect_get_place_in_interleaving_chain
1575 (load_info
, first_stmt_info
);
1576 gcc_assert (load_place
!= -1);
1577 load_permutation
.safe_push (load_place
);
1579 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1583 else if (gimple_assign_single_p (stmt_info
->stmt
)
1584 && !gimple_vuse (stmt_info
->stmt
)
1585 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1587 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1588 the same SSA name vector of a compatible type to vectype. */
1589 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1590 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1591 stmt_vec_info estmt_info
;
1592 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1594 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1595 tree bfref
= gimple_assign_rhs1 (estmt
);
1597 if (!known_eq (bit_field_size (bfref
),
1598 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1599 || !constant_multiple_p (bit_field_offset (bfref
),
1600 bit_field_size (bfref
), &lane
))
1605 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1607 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1608 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1609 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1610 /* We are always building a permutation node even if it is an identity
1611 permute to shield the rest of the vectorizer from the odd node
1612 representing an actual vector without any scalar ops.
1613 ??? We could hide it completely with making the permute node
1615 node
= vect_create_new_slp_node (node
, stmts
, 1);
1616 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1617 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1618 SLP_TREE_VECTYPE (node
) = vectype
;
1619 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1623 /* Get at the operands, verifying they are compatible. */
1624 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1625 slp_oprnd_info oprnd_info
;
1626 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1628 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1629 stmts
, i
, &oprnds_info
);
1631 matches
[(res
== -1) ? 0 : i
] = false;
1635 for (i
= 0; i
< group_size
; ++i
)
1638 vect_free_oprnd_info (oprnds_info
);
1643 auto_vec
<slp_tree
, 4> children
;
1645 stmt_info
= stmts
[0];
1647 /* Create SLP_TREE nodes for the definition node/s. */
1648 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1653 /* We're skipping certain operands from processing, for example
1654 outer loop reduction initial defs. */
1657 children
.safe_push (NULL
);
1661 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1663 /* COND_EXPR have one too many eventually if the condition
1665 gcc_assert (i
== 3 && nops
== 4);
1669 if (is_a
<bb_vec_info
> (vinfo
)
1670 && oprnd_info
->first_dt
== vect_internal_def
1671 && !oprnd_info
->any_pattern
)
1673 /* For BB vectorization, if all defs are the same do not
1674 bother to continue the build along the single-lane
1675 graph but use a splat of the scalar value. */
1676 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1677 for (j
= 1; j
< group_size
; ++j
)
1678 if (oprnd_info
->def_stmts
[j
] != first_def
)
1681 /* But avoid doing this for loads where we may be
1682 able to CSE things, unless the stmt is not
1684 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1685 || !gimple_vuse (first_def
->stmt
)))
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE
, vect_location
,
1689 "Using a splat of the uniform operand\n");
1690 oprnd_info
->first_dt
= vect_external_def
;
1694 if (oprnd_info
->first_dt
== vect_external_def
1695 || oprnd_info
->first_dt
== vect_constant_def
)
1697 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1698 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1699 oprnd_info
->ops
= vNULL
;
1700 children
.safe_push (invnode
);
1704 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1705 group_size
, &this_max_nunits
,
1707 &this_tree_size
, bst_map
)) != NULL
)
1709 oprnd_info
->def_stmts
= vNULL
;
1710 children
.safe_push (child
);
1714 /* If the SLP build for operand zero failed and operand zero
1715 and one can be commutated try that for the scalar stmts
1716 that failed the match. */
1718 /* A first scalar stmt mismatch signals a fatal mismatch. */
1720 /* ??? For COND_EXPRs we can swap the comparison operands
1721 as well as the arms under some constraints. */
1723 && oprnds_info
[1]->first_dt
== vect_internal_def
1724 && is_gimple_assign (stmt_info
->stmt
)
1725 /* Swapping operands for reductions breaks assumptions later on. */
1726 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1727 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1729 /* See whether we can swap the matching or the non-matching
1731 bool swap_not_matching
= true;
1734 for (j
= 0; j
< group_size
; ++j
)
1736 if (matches
[j
] != !swap_not_matching
)
1738 stmt_vec_info stmt_info
= stmts
[j
];
1739 /* Verify if we can swap operands of this stmt. */
1740 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1742 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1744 if (!swap_not_matching
)
1746 swap_not_matching
= false;
1751 while (j
!= group_size
);
1753 /* Swap mismatched definition stmts. */
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_NOTE
, vect_location
,
1756 "Re-trying with swapped operands of stmts ");
1757 for (j
= 0; j
< group_size
; ++j
)
1758 if (matches
[j
] == !swap_not_matching
)
1760 std::swap (oprnds_info
[0]->def_stmts
[j
],
1761 oprnds_info
[1]->def_stmts
[j
]);
1762 std::swap (oprnds_info
[0]->ops
[j
],
1763 oprnds_info
[1]->ops
[j
]);
1764 if (dump_enabled_p ())
1765 dump_printf (MSG_NOTE
, "%d ", j
);
1767 if (dump_enabled_p ())
1768 dump_printf (MSG_NOTE
, "\n");
1769 /* And try again with scratch 'matches' ... */
1770 bool *tem
= XALLOCAVEC (bool, group_size
);
1771 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1772 group_size
, &this_max_nunits
,
1774 &this_tree_size
, bst_map
)) != NULL
)
1776 oprnd_info
->def_stmts
= vNULL
;
1777 children
.safe_push (child
);
1783 /* If the SLP build failed and we analyze a basic-block
1784 simply treat nodes we fail to build as externally defined
1785 (and thus build vectors from the scalar defs).
1786 The cost model will reject outright expensive cases.
1787 ??? This doesn't treat cases where permutation ultimatively
1788 fails (or we don't try permutation below). Ideally we'd
1789 even compute a permutation that will end up with the maximum
1791 if (is_a
<bb_vec_info
> (vinfo
)
1792 /* ??? Rejecting patterns this way doesn't work. We'd have to
1793 do extra work to cancel the pattern so the uses see the
1795 && !is_pattern_stmt_p (stmt_info
)
1796 && !oprnd_info
->any_pattern
)
1798 /* But if there's a leading vector sized set of matching stmts
1799 fail here so we can split the group. This matches the condition
1800 vect_analyze_slp_instance uses. */
1801 /* ??? We might want to split here and combine the results to support
1802 multiple vector sizes better. */
1803 for (j
= 0; j
< group_size
; ++j
)
1806 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1808 if (dump_enabled_p ())
1809 dump_printf_loc (MSG_NOTE
, vect_location
,
1810 "Building vector operands from scalars\n");
1812 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1813 children
.safe_push (child
);
1814 oprnd_info
->ops
= vNULL
;
1819 gcc_assert (child
== NULL
);
1820 FOR_EACH_VEC_ELT (children
, j
, child
)
1822 vect_free_slp_tree (child
);
1823 vect_free_oprnd_info (oprnds_info
);
1827 vect_free_oprnd_info (oprnds_info
);
1829 /* If we have all children of a child built up from uniform scalars
1830 or does more than one possibly expensive vector construction then
1831 just throw that away, causing it built up from scalars.
1832 The exception is the SLP node for the vector store. */
1833 if (is_a
<bb_vec_info
> (vinfo
)
1834 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1835 /* ??? Rejecting patterns this way doesn't work. We'd have to
1836 do extra work to cancel the pattern so the uses see the
1838 && !is_pattern_stmt_p (stmt_info
))
1842 bool all_uniform_p
= true;
1843 unsigned n_vector_builds
= 0;
1844 FOR_EACH_VEC_ELT (children
, j
, child
)
1848 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1849 all_uniform_p
= false;
1850 else if (!vect_slp_tree_uniform_p (child
))
1852 all_uniform_p
= false;
1853 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1857 if (all_uniform_p
|| n_vector_builds
> 1)
1861 FOR_EACH_VEC_ELT (children
, j
, child
)
1863 vect_free_slp_tree (child
);
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_NOTE
, vect_location
,
1867 "Building parent vector operands from "
1868 "scalars instead\n");
1873 *tree_size
+= this_tree_size
+ 1;
1874 *max_nunits
= this_max_nunits
;
1878 /* ??? We'd likely want to either cache in bst_map sth like
1879 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1880 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1881 explicit stmts to put in so the keying on 'stmts' doesn't
1882 work (but we have the same issue with nodes that use 'ops'). */
1883 slp_tree one
= new _slp_tree
;
1884 slp_tree two
= new _slp_tree
;
1885 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1886 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1887 SLP_TREE_VECTYPE (one
) = vectype
;
1888 SLP_TREE_VECTYPE (two
) = vectype
;
1889 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1890 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1892 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1893 SLP_TREE_REF_COUNT (child
)++;
1895 /* Here we record the original defs since this
1896 node represents the final lane configuration. */
1897 node
= vect_create_new_slp_node (node
, stmts
, 2);
1898 SLP_TREE_VECTYPE (node
) = vectype
;
1899 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1900 SLP_TREE_CHILDREN (node
).quick_push (one
);
1901 SLP_TREE_CHILDREN (node
).quick_push (two
);
1902 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1903 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1904 enum tree_code ocode
= ERROR_MARK
;
1905 stmt_vec_info ostmt_info
;
1907 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1909 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1910 if (gimple_assign_rhs_code (ostmt
) != code0
)
1912 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1913 ocode
= gimple_assign_rhs_code (ostmt
);
1917 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1919 SLP_TREE_CODE (one
) = code0
;
1920 SLP_TREE_CODE (two
) = ocode
;
1921 SLP_TREE_LANES (one
) = stmts
.length ();
1922 SLP_TREE_LANES (two
) = stmts
.length ();
1923 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1924 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1928 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1929 SLP_TREE_VECTYPE (node
) = vectype
;
1930 SLP_TREE_CHILDREN (node
).splice (children
);
1934 /* Dump a single SLP tree NODE. */
1937 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1942 stmt_vec_info stmt_info
;
1945 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1946 dump_user_location_t user_loc
= loc
.get_user_location ();
1947 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1948 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1950 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1953 estimated_poly_value (node
->max_nunits
),
1954 SLP_TREE_REF_COUNT (node
));
1955 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
1957 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
1958 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
1960 dump_printf_loc (metadata
, user_loc
, "op template: %G",
1961 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
1963 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1964 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1965 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1968 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1969 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1970 dump_printf (metadata
, "%T%s ", op
,
1971 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1972 dump_printf (metadata
, "}\n");
1974 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1976 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1977 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1978 dump_printf (dump_kind
, " %u", j
);
1979 dump_printf (dump_kind
, " }\n");
1981 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1983 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1984 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1985 dump_printf (dump_kind
, " %u[%u]",
1986 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1987 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
1988 dump_printf (dump_kind
, " }\n");
1990 if (SLP_TREE_CHILDREN (node
).is_empty ())
1992 dump_printf_loc (metadata
, user_loc
, "\tchildren");
1993 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1994 dump_printf (dump_kind
, " %p", (void *)child
);
1995 dump_printf (dump_kind
, "\n");
1999 debug (slp_tree node
)
2001 debug_dump_context ctx
;
2002 vect_print_slp_tree (MSG_NOTE
,
2003 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2007 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2010 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2011 slp_tree node
, hash_set
<slp_tree
> &visited
)
2016 if (visited
.add (node
))
2019 vect_print_slp_tree (dump_kind
, loc
, node
);
2021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2023 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
2027 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2030 hash_set
<slp_tree
> visited
;
2031 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
2034 /* Mark the tree rooted at NODE with PURE_SLP. */
2037 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
2040 stmt_vec_info stmt_info
;
2043 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2046 if (visited
.add (node
))
2049 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2050 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2052 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2054 vect_mark_slp_stmts (child
, visited
);
2058 vect_mark_slp_stmts (slp_tree node
)
2060 hash_set
<slp_tree
> visited
;
2061 vect_mark_slp_stmts (node
, visited
);
2064 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2067 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2070 stmt_vec_info stmt_info
;
2073 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2076 if (visited
.add (node
))
2079 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2081 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2082 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2083 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2086 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2088 vect_mark_slp_stmts_relevant (child
, visited
);
2092 vect_mark_slp_stmts_relevant (slp_tree node
)
2094 hash_set
<slp_tree
> visited
;
2095 vect_mark_slp_stmts_relevant (node
, visited
);
2099 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2102 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2103 hash_set
<slp_tree
> &visited
)
2105 if (!node
|| visited
.add (node
))
2108 if (SLP_TREE_CHILDREN (node
).length () == 0)
2110 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2112 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2113 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2114 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2115 loads
.safe_push (node
);
2121 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2122 vect_gather_slp_loads (loads
, child
, visited
);
2127 /* Find the last store in SLP INSTANCE. */
2130 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2132 stmt_vec_info last
= NULL
;
2133 stmt_vec_info stmt_vinfo
;
2135 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2137 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2138 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2144 /* Find the first stmt in NODE. */
2147 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2149 stmt_vec_info first
= NULL
;
2150 stmt_vec_info stmt_vinfo
;
2152 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2154 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2156 || get_later_stmt (stmt_vinfo
, first
) == first
)
2163 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2164 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2165 (also containing the first GROUP1_SIZE stmts, since stores are
2166 consecutive), the second containing the remainder.
2167 Return the first stmt in the second group. */
2169 static stmt_vec_info
2170 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2172 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2173 gcc_assert (group1_size
> 0);
2174 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2175 gcc_assert (group2_size
> 0);
2176 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2178 stmt_vec_info stmt_info
= first_vinfo
;
2179 for (unsigned i
= group1_size
; i
> 1; i
--)
2181 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2182 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2184 /* STMT is now the last element of the first group. */
2185 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2186 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2188 DR_GROUP_SIZE (group2
) = group2_size
;
2189 for (stmt_info
= group2
; stmt_info
;
2190 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2192 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2193 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2196 /* For the second group, the DR_GROUP_GAP is that before the original group,
2197 plus skipping over the first vector. */
2198 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2200 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2201 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2205 group1_size
, group2_size
);
2210 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2211 statements and a vector of NUNITS elements. */
2214 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2216 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2220 vect_analyze_slp_instance (vec_info
*vinfo
,
2221 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2222 stmt_vec_info stmt_info
, slp_instance_kind kind
,
2223 unsigned max_tree_size
, unsigned *limit
);
2225 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2226 of KIND. Return true if successful. */
2229 vect_build_slp_instance (vec_info
*vinfo
,
2230 slp_instance_kind kind
,
2231 vec
<stmt_vec_info
> &scalar_stmts
,
2232 stmt_vec_info root_stmt_info
,
2233 unsigned max_tree_size
, unsigned *limit
,
2234 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2235 /* ??? We need stmt_info for group splitting. */
2236 stmt_vec_info stmt_info_
)
2238 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_NOTE
, vect_location
,
2241 "Starting SLP discovery for\n");
2242 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2243 dump_printf_loc (MSG_NOTE
, vect_location
,
2244 " %G", scalar_stmts
[i
]->stmt
);
2247 /* Build the tree for the SLP instance. */
2248 unsigned int group_size
= scalar_stmts
.length ();
2249 bool *matches
= XALLOCAVEC (bool, group_size
);
2250 poly_uint64 max_nunits
= 1;
2251 unsigned tree_size
= 0;
2253 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2254 &max_nunits
, matches
, limit
,
2255 &tree_size
, bst_map
);
2258 /* Calculate the unrolling factor based on the smallest type. */
2259 poly_uint64 unrolling_factor
2260 = calculate_unrolling_factor (max_nunits
, group_size
);
2262 if (maybe_ne (unrolling_factor
, 1U)
2263 && is_a
<bb_vec_info
> (vinfo
))
2265 unsigned HOST_WIDE_INT const_max_nunits
;
2266 if (!max_nunits
.is_constant (&const_max_nunits
)
2267 || const_max_nunits
> group_size
)
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2271 "Build SLP failed: store group "
2272 "size not a multiple of the vector size "
2273 "in basic block SLP\n");
2274 vect_free_slp_tree (node
);
2277 /* Fatal mismatch. */
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_NOTE
, vect_location
,
2280 "SLP discovery succeeded but node needs "
2282 memset (matches
, true, group_size
);
2283 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2284 vect_free_slp_tree (node
);
2288 /* Create a new SLP instance. */
2289 slp_instance new_instance
= XNEW (class _slp_instance
);
2290 SLP_INSTANCE_TREE (new_instance
) = node
;
2291 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2292 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2293 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2294 SLP_INSTANCE_KIND (new_instance
) = kind
;
2295 new_instance
->reduc_phis
= NULL
;
2296 new_instance
->cost_vec
= vNULL
;
2297 new_instance
->subgraph_entries
= vNULL
;
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_NOTE
, vect_location
,
2301 "SLP size %u vs. limit %u.\n",
2302 tree_size
, max_tree_size
);
2304 /* Fixup SLP reduction chains. */
2305 if (kind
== slp_inst_kind_reduc_chain
)
2307 /* If this is a reduction chain with a conversion in front
2308 amend the SLP tree with a node for that. */
2310 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2311 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2313 /* Get at the conversion stmt - we know it's the single use
2314 of the last stmt of the reduction chain. */
2315 use_operand_p use_p
;
2316 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2317 &use_p
, &scalar_def
);
2319 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2320 next_info
= vect_stmt_to_vectorize (next_info
);
2321 scalar_stmts
= vNULL
;
2322 scalar_stmts
.create (group_size
);
2323 for (unsigned i
= 0; i
< group_size
; ++i
)
2324 scalar_stmts
.quick_push (next_info
);
2325 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2326 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2327 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2328 SLP_INSTANCE_TREE (new_instance
) = conv
;
2329 /* We also have to fake this conversion stmt as SLP reduction
2330 group so we don't have to mess with too much code
2332 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2333 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2335 /* Fill the backedge child of the PHI SLP node. The
2336 general matching code cannot find it because the
2337 scalar code does not reflect how we vectorize the
2339 use_operand_p use_p
;
2340 imm_use_iterator imm_iter
;
2341 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2342 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2343 gimple_get_lhs (scalar_def
))
2344 /* There are exactly two non-debug uses, the reduction
2345 PHI and the loop-closed PHI node. */
2346 if (!is_gimple_debug (USE_STMT (use_p
))
2347 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2349 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2350 stmt_vec_info phi_info
2351 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2352 for (unsigned i
= 0; i
< group_size
; ++i
)
2353 phis
.quick_push (phi_info
);
2354 slp_tree
*phi_node
= bst_map
->get (phis
);
2355 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2356 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2357 = SLP_INSTANCE_TREE (new_instance
);
2358 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2362 vinfo
->slp_instances
.safe_push (new_instance
);
2364 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2365 the number of scalar stmts in the root in a few places.
2366 Verify that assumption holds. */
2367 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2368 .length () == group_size
);
2370 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE
, vect_location
,
2373 "Final SLP tree for instance %p:\n", new_instance
);
2374 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2375 SLP_INSTANCE_TREE (new_instance
));
2383 /* Failed to SLP. */
2384 /* Free the allocated memory. */
2385 scalar_stmts
.release ();
2388 stmt_vec_info stmt_info
= stmt_info_
;
2389 /* Try to break the group up into pieces. */
2390 if (kind
== slp_inst_kind_store
)
2392 /* ??? We could delay all the actual splitting of store-groups
2393 until after SLP discovery of the original group completed.
2394 Then we can recurse to vect_build_slp_instance directly. */
2395 for (i
= 0; i
< group_size
; i
++)
2399 /* For basic block SLP, try to break the group up into multiples of
2401 if (is_a
<bb_vec_info
> (vinfo
)
2402 && (i
> 1 && i
< group_size
))
2405 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2406 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2407 1 << floor_log2 (i
));
2408 unsigned HOST_WIDE_INT const_nunits
;
2410 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2412 /* Split into two groups at the first vector boundary. */
2413 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2414 unsigned group1_size
= i
& ~(const_nunits
- 1);
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_NOTE
, vect_location
,
2418 "Splitting SLP group at stmt %u\n", i
);
2419 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2421 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2422 kind
, max_tree_size
,
2424 /* Split the rest at the failure point and possibly
2425 re-analyze the remaining matching part if it has
2426 at least two lanes. */
2428 && (i
+ 1 < group_size
2429 || i
- group1_size
> 1))
2431 stmt_vec_info rest2
= rest
;
2432 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2433 if (i
- group1_size
> 1)
2434 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
2435 kind
, max_tree_size
,
2438 /* Re-analyze the non-matching tail if it has at least
2440 if (i
+ 1 < group_size
)
2441 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2442 rest
, kind
, max_tree_size
,
2448 /* For loop vectorization split into arbitrary pieces of size > 1. */
2449 if (is_a
<loop_vec_info
> (vinfo
)
2450 && (i
> 1 && i
< group_size
))
2452 unsigned group1_size
= i
;
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_NOTE
, vect_location
,
2456 "Splitting SLP group at stmt %u\n", i
);
2458 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2460 /* Loop vectorization cannot handle gaps in stores, make sure
2461 the split group appears as strided. */
2462 STMT_VINFO_STRIDED_P (rest
) = 1;
2463 DR_GROUP_GAP (rest
) = 0;
2464 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2465 DR_GROUP_GAP (stmt_info
) = 0;
2467 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2468 kind
, max_tree_size
, limit
);
2469 if (i
+ 1 < group_size
)
2470 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2471 rest
, kind
, max_tree_size
, limit
);
2476 /* Even though the first vector did not all match, we might be able to SLP
2477 (some) of the remainder. FORNOW ignore this possibility. */
2480 /* Failed to SLP. */
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2487 /* Analyze an SLP instance starting from a group of grouped stores. Call
2488 vect_build_slp_tree to build a tree of packed stmts if possible.
2489 Return FALSE if it's impossible to SLP any stmt in the loop. */
2492 vect_analyze_slp_instance (vec_info
*vinfo
,
2493 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2494 stmt_vec_info stmt_info
,
2495 slp_instance_kind kind
,
2496 unsigned max_tree_size
, unsigned *limit
)
2499 vec
<stmt_vec_info
> scalar_stmts
;
2501 if (is_a
<bb_vec_info
> (vinfo
))
2502 vect_location
= stmt_info
->stmt
;
2504 stmt_vec_info next_info
= stmt_info
;
2505 if (kind
== slp_inst_kind_store
)
2507 /* Collect the stores and store them in scalar_stmts. */
2508 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
2511 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2512 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2515 else if (kind
== slp_inst_kind_reduc_chain
)
2517 /* Collect the reduction stmts and store them in scalar_stmts. */
2518 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
2521 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2522 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2524 /* Mark the first element of the reduction chain as reduction to properly
2525 transform the node. In the reduction analysis phase only the last
2526 element of the chain is marked as reduction. */
2527 STMT_VINFO_DEF_TYPE (stmt_info
)
2528 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2529 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2530 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2532 else if (kind
== slp_inst_kind_ctor
)
2534 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2536 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
2537 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2539 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
2540 def_info
= vect_stmt_to_vectorize (def_info
);
2541 scalar_stmts
.quick_push (def_info
);
2543 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_NOTE
, vect_location
,
2545 "Analyzing vectorizable constructor: %G\n",
2548 else if (kind
== slp_inst_kind_reduc_group
)
2550 /* Collect reduction statements. */
2551 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2552 scalar_stmts
.create (reductions
.length ());
2553 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2554 if (STMT_VINFO_RELEVANT_P (next_info
)
2555 || STMT_VINFO_LIVE_P (next_info
))
2556 scalar_stmts
.quick_push (next_info
);
2557 /* If less than two were relevant/live there's nothing to SLP. */
2558 if (scalar_stmts
.length () < 2)
2564 /* Build the tree for the SLP instance. */
2565 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2566 kind
== slp_inst_kind_ctor
2568 max_tree_size
, limit
, bst_map
,
2569 kind
== slp_inst_kind_store
2570 ? stmt_info
: NULL
);
2572 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2573 where we should do store group splitting. */
2578 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2579 trees of packed scalar stmts if SLP is possible. */
2582 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2585 stmt_vec_info first_element
;
2587 DUMP_VECT_SCOPE ("vect_analyze_slp");
2589 unsigned limit
= max_tree_size
;
2591 scalar_stmts_to_slp_tree_map_t
*bst_map
2592 = new scalar_stmts_to_slp_tree_map_t ();
2594 /* Find SLP sequences starting from groups of grouped stores. */
2595 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2596 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2597 STMT_VINFO_GROUPED_ACCESS (first_element
)
2598 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
2599 max_tree_size
, &limit
);
2601 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
2603 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
2605 vect_location
= bb_vinfo
->roots
[i
].root
->stmt
;
2606 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
2607 bb_vinfo
->roots
[i
].stmts
,
2608 bb_vinfo
->roots
[i
].root
,
2609 max_tree_size
, &limit
, bst_map
, NULL
))
2610 bb_vinfo
->roots
[i
].stmts
= vNULL
;
2614 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2616 /* Find SLP sequences starting from reduction chains. */
2617 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2618 if (! STMT_VINFO_RELEVANT_P (first_element
)
2619 && ! STMT_VINFO_LIVE_P (first_element
))
2621 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2622 slp_inst_kind_reduc_chain
,
2623 max_tree_size
, &limit
))
2625 /* Dissolve reduction chain group. */
2626 stmt_vec_info vinfo
= first_element
;
2627 stmt_vec_info last
= NULL
;
2630 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2631 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2632 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2636 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2637 /* It can be still vectorized as part of an SLP reduction. */
2638 loop_vinfo
->reductions
.safe_push (last
);
2641 /* Find SLP sequences starting from groups of reductions. */
2642 if (loop_vinfo
->reductions
.length () > 1)
2643 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2644 slp_inst_kind_reduc_group
, max_tree_size
,
2648 /* The map keeps a reference on SLP nodes built, release that. */
2649 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2650 it
!= bst_map
->end (); ++it
)
2652 vect_free_slp_tree ((*it
).second
);
2655 return opt_result::success ();
2658 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2661 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2662 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2667 if (visited
.add (node
))
2670 node
->vertex
= vertices
.length ();
2671 vertices
.safe_push (node
);
2674 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2678 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2681 leafs
.safe_push (node
->vertex
);
2684 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2687 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2690 hash_set
<slp_tree
> visited
;
2692 slp_instance instance
;
2693 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2695 unsigned n_v
= vertices
.length ();
2696 unsigned n_l
= leafs
.length ();
2697 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2699 /* If we added vertices but no entries to the reverse graph we've
2700 added a cycle that is not backwards-reachable. Push the entry
2701 to mimic as leaf then. */
2702 if (vertices
.length () > n_v
2703 && leafs
.length () == n_l
)
2704 leafs
.safe_push (SLP_INSTANCE_TREE (instance
)->vertex
);
/* Apply (reverse) bijective PERM to VEC.  With REVERSE true element I
   moves to slot PERM[I]; with REVERSE false slot I receives element
   PERM[I].  The trailing asserts verify PERM is a bijection (a
   non-bijective permute would silently drop elements).  */

template <class T>
static void
vect_slp_permute (vec<unsigned> perm,
		  vec<T> &vec, bool reverse)
{
  /* Snapshot the input so we can permute in place.  */
  auto_vec<T, 64> saved;
  saved.create (vec.length ());
  for (unsigned i = 0; i < vec.length (); ++i)
    saved.quick_push (vec[i]);

  if (reverse)
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[perm[i]] = saved[i];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[perm[i]] == saved[i]);
    }
  else
    {
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[i] = saved[perm[i]];
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[i] == saved[perm[i]]);
    }
}
2736 /* Return whether permutations PERM_A and PERM_B as recorded in the
2737 PERMS vector are equal. */
2740 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
2741 int perm_a
, int perm_b
)
2743 return (perm_a
== perm_b
2744 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
2745 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
2746 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
2749 /* Optimize the SLP graph of VINFO. */
2752 vect_optimize_slp (vec_info
*vinfo
)
2754 if (vinfo
->slp_instances
.is_empty ())
2759 auto_vec
<slp_tree
> vertices
;
2760 auto_vec
<int> leafs
;
2761 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
2763 struct graph
*slpg
= new_graph (vertices
.length ());
2764 FOR_EACH_VEC_ELT (vertices
, i
, node
)
2768 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2770 add_edge (slpg
, i
, child
->vertex
);
2773 /* Compute (reverse) postorder on the inverted graph. */
2775 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
2777 auto_sbitmap
n_visited (vertices
.length ());
2778 auto_sbitmap
n_materialize (vertices
.length ());
2779 auto_vec
<int> n_perm (vertices
.length ());
2780 auto_vec
<vec
<unsigned> > perms
;
2782 bitmap_clear (n_visited
);
2783 bitmap_clear (n_materialize
);
2784 n_perm
.quick_grow_cleared (vertices
.length ());
2785 perms
.safe_push (vNULL
); /* zero is no permute */
2787 /* Produce initial permutations. */
2788 for (i
= 0; i
< leafs
.length (); ++i
)
2791 slp_tree node
= vertices
[idx
];
2793 /* Handle externals and constants optimistically throughout the
2795 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
2796 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
2799 /* Leafs do not change across iterations. Note leafs also double
2800 as entries to the reverse graph. */
2801 if (!slpg
->vertices
[idx
].succ
)
2802 bitmap_set_bit (n_visited
, idx
);
2803 /* Loads are the only thing generating permutes. */
2804 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2807 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2808 node unpermuted, record this permute. */
2809 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
2810 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
2812 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
2813 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
2814 bool any_permute
= false;
2815 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2817 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
2818 imin
= MIN (imin
, idx
);
2819 imax
= MAX (imax
, idx
);
2820 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
2823 /* If there's no permute no need to split one out. */
2826 /* If the span doesn't match we'd disrupt VF computation, avoid
2828 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
2831 /* For now only handle true permutes, like
2832 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2833 when permuting constants and invariants keeping the permute
2835 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
2836 bitmap_clear (load_index
);
2837 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2838 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
2840 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2841 if (!bitmap_bit_p (load_index
, j
))
2843 if (j
!= SLP_TREE_LANES (node
))
2846 vec
<unsigned> perm
= vNULL
;
2847 perm
.safe_grow (SLP_TREE_LANES (node
), true);
2848 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2849 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
2850 perms
.safe_push (perm
);
2851 n_perm
[idx
] = perms
.length () - 1;
2854 /* Propagate permutes along the graph and compute materialization points. */
2856 unsigned iteration
= 0;
2862 for (i
= vertices
.length (); i
> 0 ; --i
)
2865 slp_tree node
= vertices
[idx
];
2866 /* For leafs there's nothing to do - we've seeded permutes
2868 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2871 bitmap_set_bit (n_visited
, idx
);
2873 /* We cannot move a permute across a store. */
2874 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
2876 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
2880 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
2881 succ
; succ
= succ
->succ_next
)
2883 int succ_idx
= succ
->dest
;
2884 /* Handle unvisited nodes optimistically. */
2885 /* ??? But for constants once we want to handle non-bijective
2886 permutes we have to verify the permute, when unifying lanes,
2887 will not unify different constants. For example see
2888 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2889 if (!bitmap_bit_p (n_visited
, succ_idx
))
2891 int succ_perm
= n_perm
[succ_idx
];
2892 /* Once we materialize succs permutation its output lanes
2893 appear unpermuted to us. */
2894 if (bitmap_bit_p (n_materialize
, succ_idx
))
2898 else if (succ_perm
== 0)
2903 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
2911 /* Pick up pre-computed leaf values. */
2913 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
2916 /* Make sure we eventually converge. */
2917 gcc_checking_assert (perm
== 0);
2920 bitmap_clear_bit (n_materialize
, idx
);
2927 /* Elide pruning at materialization points in the first
2928 iteration so every node was visited once at least. */
2932 /* Decide on permute materialization. Look whether there's
2933 a use (pred) edge that is permuted differently than us.
2934 In that case mark ourselves so the permutation is applied. */
2935 bool all_preds_permuted
= slpg
->vertices
[idx
].pred
!= NULL
;
2936 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
2937 pred
; pred
= pred
->pred_next
)
2939 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
2940 int pred_perm
= n_perm
[pred
->src
];
2941 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
2943 all_preds_permuted
= false;
2947 if (!all_preds_permuted
)
2949 if (!bitmap_bit_p (n_materialize
, idx
))
2951 bitmap_set_bit (n_materialize
, idx
);
2955 while (changed
|| iteration
== 1);
2958 for (i
= 0; i
< vertices
.length (); ++i
)
2960 int perm
= n_perm
[i
];
2964 slp_tree node
= vertices
[i
];
2966 /* First permute invariant/external original successors. */
2969 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2971 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2974 /* If the vector is uniform there's nothing to do. */
2975 if (vect_slp_tree_uniform_p (child
))
2978 /* We can end up sharing some externals via two_operator
2979 handling. Be prepared to unshare those. */
2980 if (child
->refcnt
!= 1)
2982 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
2983 SLP_TREE_CHILDREN (node
)[j
] = child
2984 = vect_create_new_slp_node
2985 (SLP_TREE_SCALAR_OPS (child
).copy ());
2987 vect_slp_permute (perms
[perm
],
2988 SLP_TREE_SCALAR_OPS (child
), true);
2991 if (bitmap_bit_p (n_materialize
, i
))
2993 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2994 /* For loads simply drop the permutation, the load permutation
2995 already performs the desired permutation. */
2997 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
2999 /* If the node if already a permute node we just need to apply
3000 the permutation to the permute node itself. */
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_NOTE
, vect_location
,
3003 "simplifying permute node %p\n",
3006 vect_slp_permute (perms
[perm
], SLP_TREE_LANE_PERMUTATION (node
),
3011 if (dump_enabled_p ())
3012 dump_printf_loc (MSG_NOTE
, vect_location
,
3013 "inserting permute node in place of %p\n",
3016 /* Make a copy of NODE and in-place change it to a
3017 VEC_PERM node to permute the lanes of the copy. */
3018 slp_tree copy
= new _slp_tree
;
3019 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
3020 SLP_TREE_CHILDREN (node
) = vNULL
;
3021 SLP_TREE_SCALAR_STMTS (copy
)
3022 = SLP_TREE_SCALAR_STMTS (node
).copy ();
3023 vect_slp_permute (perms
[perm
],
3024 SLP_TREE_SCALAR_STMTS (copy
), true);
3025 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
3026 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
3027 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
3028 SLP_TREE_LANE_PERMUTATION (copy
)
3029 = SLP_TREE_LANE_PERMUTATION (node
);
3030 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3031 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3033 copy
->max_nunits
= node
->max_nunits
;
3034 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3035 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3036 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3038 /* Now turn NODE into a VEC_PERM. */
3039 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3040 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3041 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3042 SLP_TREE_LANE_PERMUTATION (node
)
3043 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3044 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3049 /* Apply the reverse permutation to our stmts. */
3050 vect_slp_permute (perms
[perm
],
3051 SLP_TREE_SCALAR_STMTS (node
), true);
3052 /* And to the load permutation, which we can simply
3053 make regular by design. */
3054 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3056 /* ??? When we handle non-bijective permutes the idea
3057 is that we can force the load-permutation to be
3058 { min, min + 1, min + 2, ... max }. But then the
3059 scalar defs might no longer match the lane content
3060 which means wrong-code with live lane vectorization.
3061 So we possibly have to have NULL entries for those. */
3062 vect_slp_permute (perms
[perm
],
3063 SLP_TREE_LOAD_PERMUTATION (node
), true);
3068 /* Free the perms vector used for propagation. */
3069 while (!perms
.is_empty ())
3070 perms
.pop ().release ();
3074 /* Now elide load permutations that are not necessary. */
3075 for (i
= 0; i
< leafs
.length (); ++i
)
3077 node
= vertices
[leafs
[i
]];
3078 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3081 /* In basic block vectorization we allow any subchain of an interleaving
3083 FORNOW: not in loop SLP because of realignment complications. */
3084 if (is_a
<bb_vec_info
> (vinfo
))
3086 bool subchain_p
= true;
3087 stmt_vec_info next_load_info
= NULL
;
3088 stmt_vec_info load_info
;
3090 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3093 && (next_load_info
!= load_info
3094 || DR_GROUP_GAP (load_info
) != 1))
3099 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3103 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3109 stmt_vec_info load_info
;
3110 bool this_load_permuted
= false;
3112 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3113 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3115 this_load_permuted
= true;
3118 stmt_vec_info first_stmt_info
3119 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3120 if (!this_load_permuted
3121 /* The load requires permutation when unrolling exposes
3122 a gap either because the group is larger than the SLP
3123 group-size or because there is a gap between the groups. */
3124 && (known_eq (LOOP_VINFO_VECT_FACTOR
3125 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3126 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3127 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3129 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3136 /* Gather loads reachable from the individual SLP graph entries. */
3139 vect_gather_slp_loads (vec_info
*vinfo
)
3142 slp_instance instance
;
3143 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
3145 hash_set
<slp_tree
> visited
;
3146 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
3147 SLP_INSTANCE_TREE (instance
), visited
);
3152 /* For each possible SLP instance decide whether to SLP it and calculate overall
3153 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3154 least one instance. */
3157 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3160 poly_uint64 unrolling_factor
= 1;
3161 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3162 slp_instance instance
;
3163 int decided_to_slp
= 0;
3165 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3167 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3169 /* FORNOW: SLP if you can. */
3170 /* All unroll factors have the form:
3172 GET_MODE_SIZE (vinfo->vector_mode) * X
3174 for some rational X, so they must have a common multiple. */
3176 = force_common_multiple (unrolling_factor
,
3177 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3179 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3180 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3181 loop-based vectorization. Such stmts will be marked as HYBRID. */
3182 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3186 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3188 if (decided_to_slp
&& dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE
, vect_location
,
3191 "Decided to SLP %d instances. Unrolling factor ",
3193 dump_dec (MSG_NOTE
, unrolling_factor
);
3194 dump_printf (MSG_NOTE
, "\n");
3197 return (decided_to_slp
> 0);
3200 /* Private data for vect_detect_hybrid_slp. */
3203 loop_vec_info loop_vinfo
;
3204 vec
<stmt_vec_info
> *worklist
;
3207 /* Walker for walk_gimple_op. */
3210 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3212 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3213 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3218 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3221 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3222 if (PURE_SLP_STMT (def_stmt_info
))
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3226 def_stmt_info
->stmt
);
3227 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3228 dat
->worklist
->safe_push (def_stmt_info
);
3234 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3235 if so, otherwise pushing it to WORKLIST. */
3238 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
3239 vec
<stmt_vec_info
> &worklist
,
3240 stmt_vec_info stmt_info
)
3242 if (dump_enabled_p ())
3243 dump_printf_loc (MSG_NOTE
, vect_location
,
3244 "Processing hybrid candidate : %G", stmt_info
->stmt
);
3245 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
3246 imm_use_iterator iter2
;
3248 use_operand_p use_p
;
3249 def_operand_p def_p
;
3250 bool any_def
= false;
3251 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
3254 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
3256 if (is_gimple_debug (USE_STMT (use_p
)))
3258 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
3259 /* An out-of loop use means this is a loop_vect sink. */
3262 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_NOTE
, vect_location
,
3264 "Found loop_vect sink: %G", stmt_info
->stmt
);
3265 worklist
.safe_push (stmt_info
);
3268 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
3270 if (dump_enabled_p ())
3271 dump_printf_loc (MSG_NOTE
, vect_location
,
3272 "Found loop_vect use: %G", use_info
->stmt
);
3273 worklist
.safe_push (stmt_info
);
3278 /* No def means this is a loo_vect sink. */
3281 if (dump_enabled_p ())
3282 dump_printf_loc (MSG_NOTE
, vect_location
,
3283 "Found loop_vect sink: %G", stmt_info
->stmt
);
3284 worklist
.safe_push (stmt_info
);
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_NOTE
, vect_location
,
3289 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
3290 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
3293 /* Find stmts that must be both vectorized and SLPed. */
3296 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3298 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3300 /* All stmts participating in SLP are marked pure_slp, all other
3301 stmts are loop_vect.
3302 First collect all loop_vect stmts into a worklist.
3303 SLP patterns cause not all original scalar stmts to appear in
3304 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3305 Rectify this here and do a backward walk over the IL only considering
3306 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3307 mark them as pure_slp. */
3308 auto_vec
<stmt_vec_info
> worklist
;
3309 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
3311 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3312 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3315 gphi
*phi
= gsi
.phi ();
3316 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3317 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3318 maybe_push_to_hybrid_worklist (loop_vinfo
,
3319 worklist
, stmt_info
);
3321 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
3324 gimple
*stmt
= gsi_stmt (gsi
);
3325 if (is_gimple_debug (stmt
))
3327 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3328 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3330 for (gimple_stmt_iterator gsi2
3331 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3332 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3334 stmt_vec_info patt_info
3335 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3336 if (!STMT_SLP_TYPE (patt_info
)
3337 && STMT_VINFO_RELEVANT (patt_info
))
3338 maybe_push_to_hybrid_worklist (loop_vinfo
,
3339 worklist
, patt_info
);
3341 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3343 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3344 maybe_push_to_hybrid_worklist (loop_vinfo
,
3345 worklist
, stmt_info
);
3349 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3350 mark any SLP vectorized stmt as hybrid.
3351 ??? We're visiting def stmts N times (once for each non-SLP and
3352 once for each hybrid-SLP use). */
3355 dat
.worklist
= &worklist
;
3356 dat
.loop_vinfo
= loop_vinfo
;
3357 memset (&wi
, 0, sizeof (wi
));
3358 wi
.info
= (void *)&dat
;
3359 while (!worklist
.is_empty ())
3361 stmt_vec_info stmt_info
= worklist
.pop ();
3362 /* Since SSA operands are not set up for pattern stmts we need
3363 to use walk_gimple_op. */
3365 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3370 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3372 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3373 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
), roots (vNULL
)
/* Assign uid 0 to every phi and stmt in each region block; the destructor
   resets them to -1, so a non-negative uid presumably marks membership in
   the vectorized region -- TODO confirm against vec_info documentation.  */
3375 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3378 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3381 gphi
*phi
= si
.phi ();
3382 gimple_set_uid (phi
, 0);
3385 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3386 !gsi_end_p (gsi
); gsi_next (&gsi
))
3388 gimple
*stmt
= gsi_stmt (gsi
);
3389 gimple_set_uid (stmt
, 0);
/* Debug stmts are tested separately; NOTE(review): the excerpt appears to
   have lines elided after this test -- confirm against upstream source.  */
3390 if (is_gimple_debug (stmt
))
3398 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3399 stmts in the basic block. */
3401 _bb_vec_info::~_bb_vec_info ()
3403 /* Reset region marker. */
/* Mirror of the constructor: restore uid -1 on every phi and stmt of the
   region so they are no longer marked as belonging to it.  */
3404 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3407 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3410 gphi
*phi
= si
.phi ();
3411 gimple_set_uid (phi
, -1);
3413 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3414 !gsi_end_p (gsi
); gsi_next (&gsi
))
3416 gimple
*stmt
= gsi_stmt (gsi
);
3417 gimple_set_uid (stmt
, -1);
/* Release the scalar-stmt vectors recorded for each SLP graph root.  */
3421 for (unsigned i
= 0; i
< roots
.length (); ++i
)
3422 roots
[i
].stmts
.release ();
3426 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3427 given that child nodes have already been processed, and that
3428 their def types currently match their SLP node's def type. */
3431 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3432 slp_instance node_instance
,
3433 stmt_vector_for_cost
*cost_vec
)
3435 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3436 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3438 /* Calculate the number of vector statements to be created for the
3439 scalar stmts in this node. For SLP reductions it is equal to the
3440 number of vector statements in the children (which has already been
3441 calculated by the recursive call). Otherwise it is the number of
3442 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3443 VF divided by the number of elements in a vector. */
3444 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3445 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
/* SLP reduction: inherit the vector stmt count from the first internal
   child (all children agree per the comment above).  */
3447 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3448 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3450 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3451 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
/* Otherwise compute it from VF (1 for BB vectorization) times the group
   size divided by the number of vector elements.  */
3458 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3459 vf
= loop_vinfo
->vectorization_factor
;
3462 unsigned int group_size
= SLP_TREE_LANES (node
);
3463 tree vectype
= SLP_TREE_VECTYPE (node
);
3464 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3465 = vect_get_num_vectors (vf
* group_size
, vectype
);
3468 /* Handle purely internal nodes. */
3469 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3470 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
)
;
/* BB vectorization shares vector types between SLP instances; reject the
   node if NODE's desired type conflicts with a previously chosen one.  */
3472 if (is_a
<bb_vec_info
> (vinfo
)
3473 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3475 if (dump_enabled_p ())
3476 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3477 "desired vector type conflicts with earlier one "
3478 "for %G", stmt_info
->stmt
);
3483 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3484 node
, node_instance
, cost_vec
);
3487 /* Try to build NODE from scalars, returning true on success.
3488 NODE_INSTANCE is the SLP instance that contains NODE. */
3491 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3492 slp_instance node_instance
)
3494 stmt_vec_info stmt_info
;
/* Only BB vectorization, and only for non-root nodes that have scalar
   stmts and contain no pattern stmts.  */
3497 if (!is_a
<bb_vec_info
> (vinfo
)
3498 || node
== SLP_INSTANCE_TREE (node_instance
)
3499 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3500 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_NOTE
, vect_location
,
3505 "Building vector operands of %p from scalars instead\n", node
);
3507 /* Don't remove and free the child nodes here, since they could be
3508 referenced by other structures. The analysis and scheduling phases
3509 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3510 unsigned int group_size
= SLP_TREE_LANES (node
);
3511 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3512 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3513 SLP_TREE_LOAD_PERMUTATION (node
).release ();
/* Record, lane by lane, the scalar defs (original stmt LHSs) that will be
   used to build the vector operand.  */
3514 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3516 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3517 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3522 /* Compute the prologue cost for invariant or constant operands represented
3526 vect_prologue_cost_for_slp (slp_tree node
,
3527 stmt_vector_for_cost
*cost_vec
)
3529 /* There's a special case of an existing vector, that costs nothing. */
3530 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3531 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3533 /* Without looking at the actual initializer a vector of
3534 constants can be implemented as load from the constant pool.
3535 When all elements are the same we can use a splat. */
3536 tree vectype
= SLP_TREE_VECTYPE (node
);
3537 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3538 unsigned num_vects_to_check
;
3539 unsigned HOST_WIDE_INT const_nunits
;
3540 unsigned nelt_limit
;
/* When the vector length is constant and the group does not evenly fill
   whole vectors, each emitted vector must be costed individually.  */
3541 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3542 && ! multiple_p (const_nunits
, group_size
))
3544 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3545 nelt_limit
= const_nunits
;
3549 /* If either the vector has variable length or the vectors
3550 are composed of repeated whole groups we only need to
3551 cost construction once. All vectors will be the same. */
3552 num_vects_to_check
= 1;
3553 nelt_limit
= group_size
;
3555 tree elt
= NULL_TREE
;
/* Scan the scalar operands vector-by-vector; ELT stays non-NULL only while
   all operands of the current vector initializer are identical.  */
3557 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3559 unsigned si
= j
% group_size
;
3561 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3562 /* ??? We're just tracking whether all operands of a single
3563 vector initializer are the same, ideally we'd check if
3564 we emitted the same one already. */
3565 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
3568 if (nelt
== nelt_limit
)
/* One full vector worth of operands seen: cost a splat (scalar_to_vec)
   if all elements were equal, otherwise a full construction.  */
3570 record_stmt_cost (cost_vec
, 1,
3571 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3572 ? (elt
? scalar_to_vec
: vec_construct
)
3574 NULL
, vectype
, 0, vect_prologue
);
3580 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3581 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3583 Return true if the operations are supported. */
3586 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3587 slp_instance node_instance
,
3588 hash_set
<slp_tree
> &visited_set
,
3589 vec
<slp_tree
> &visited_vec
,
3590 stmt_vector_for_cost
*cost_vec
)
3595 /* Assume we can code-generate all invariants. */
3597 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3598 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
/* An uninitialized def type means we ran into a cycle in the SLP graph.  */
3601 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3603 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_NOTE
, vect_location
,
3605 "Failed cyclic SLP reference in %p", node
);
3608 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3610 /* If we already analyzed the exact same set of scalar stmts we're done.
3611 We share the generated vector stmts for those. */
3612 if (visited_set
.add (node
))
3614 visited_vec
.safe_push (node
);
/* Remember how far the visited and cost vectors extend before recursing so
   both can be rolled back on failure (see below).  */
3617 unsigned visited_rec_start
= visited_vec
.length ();
3618 unsigned cost_vec_rec_start
= cost_vec
->length ();
3619 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3621 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3622 visited_set
, visited_vec
,
3629 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3631 /* If analysis failed we have to pop all recursive visited nodes
3635 while (visited_vec
.length () >= visited_rec_start
)
3636 visited_set
.remove (visited_vec
.pop ());
3637 cost_vec
->truncate (cost_vec_rec_start
);
3640 /* When the node can be vectorized cost invariant nodes it references.
3641 This is not done in DFS order to allow the referring node
3642 vectorizable_* calls to nail down the invariant nodes vector type
3643 and possibly unshare it if it needs a different vector type than
3646 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3648 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3649 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3650 /* Perform usual caching, note code-generation still
3651 code-gens these nodes multiple times but we expect
3652 to CSE them later. */
3653 && !visited_set
.add (child
))
3655 visited_vec
.safe_push (child
);
3656 /* ??? After auditing more code paths make a "default"
3657 and push the vector type from NODE to all children
3658 if it is not already set. */
3659 /* Compute the number of vectors to be generated. */
3660 tree vector_type
= SLP_TREE_VECTYPE (child
);
3663 /* For shifts with a scalar argument we don't need
3664 to cost or code-generate anything.
3665 ??? Represent this more explicitly. */
3666 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3667 == shift_vec_info_type
)
3671 unsigned group_size
= SLP_TREE_LANES (child
);
3673 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3674 vf
= loop_vinfo
->vectorization_factor
;
3675 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3676 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3677 /* And cost them. */
3678 vect_prologue_cost_for_slp (child
, cost_vec
);
3681 /* If this node or any of its children can't be vectorized, try pruning
3682 the tree here rather than felling the whole thing. */
3683 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3685 /* We'll need to revisit this for invariant costing and number
3686 of vectorized stmt setting. */
3694 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3695 region and that can be vectorized using vectorizable_live_operation
3696 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3697 scalar code computing it to be retained. */
3700 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3701 slp_instance instance
,
3702 stmt_vector_for_cost
*cost_vec
,
3703 hash_set
<stmt_vec_info
> &svisited
,
3704 hash_set
<slp_tree
> &visited
)
3706 if (visited
.add (node
))
3710 stmt_vec_info stmt_info
;
3711 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3712 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3714 if (svisited
.contains (stmt_info
))
3716 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3717 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
3718 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
3719 /* Only the pattern root stmt computes the original scalar value. */
3721 bool mark_visited
= true;
3722 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3723 ssa_op_iter op_iter
;
3724 def_operand_p def_p
;
/* For each SSA def of the original stmt, look for an immediate use outside
   the pure-SLP region; such a use makes the def "live".  */
3725 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3727 imm_use_iterator use_iter
;
3729 stmt_vec_info use_stmt_info
;
3730 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3731 if (!is_gimple_debug (use_stmt
))
3733 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
3735 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3737 STMT_VINFO_LIVE_P (stmt_info
) = true;
3738 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3739 NULL
, node
, instance
, i
,
3741 /* ??? So we know we can vectorize the live stmt
3742 from one SLP node. If we cannot do so from all
3743 or none consistently we'd have to record which
3744 SLP node (and lane) we want to use for the live
3745 operation. So make sure we can code-generate
3747 mark_visited
= false;
3749 STMT_VINFO_LIVE_P (stmt_info
) = false;
3750 BREAK_FROM_IMM_USE_STMT (use_iter
);
3753 /* We have to verify whether we can insert the lane extract
3754 before all uses. The following is a conservative approximation.
3755 We cannot put this into vectorizable_live_operation because
3756 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3758 Note that while the fact that we emit code for loads at the
3759 first load should make this a non-problem leafs we construct
3760 from scalars are vectorized after the last scalar def.
3761 ??? If we'd actually compute the insert location during
3762 analysis we could use sth less conservative than the last
3763 scalar stmt in the node for the dominance check. */
3764 /* ??? What remains is "live" uses in vector CTORs in the same
3765 SLP graph which is where those uses can end up code-generated
3766 right after their definition instead of close to their original
3767 use. But that would restrict us to code-generate lane-extracts
3768 from the latest stmt in a node. So we compensate for this
3769 during code-generation, simply not replacing uses for those
3770 hopefully rare cases. */
3771 if (STMT_VINFO_LIVE_P (stmt_info
))
3772 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3773 if (!is_gimple_debug (use_stmt
)
3774 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
3775 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3776 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
3778 if (dump_enabled_p ())
3779 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3780 "Cannot determine insertion place for "
3782 STMT_VINFO_LIVE_P (stmt_info
) = false;
3783 mark_visited
= true;
3787 svisited
.add (stmt_info
);
/* Recurse into internal child nodes.  */
3791 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3792 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3793 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
3794 cost_vec
, svisited
, visited
);
3797 /* Analyze statements in SLP instances of VINFO. Return true if the
3798 operations are supported. */
3801 vect_slp_analyze_operations (vec_info
*vinfo
)
3803 slp_instance instance
;
3806 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3808 hash_set
<slp_tree
> visited
;
/* Iterate without ++i: failing instances are ordered_remove'd, which shifts
   the next instance into slot I.  */
3809 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
3811 auto_vec
<slp_tree
> visited_vec
;
3812 stmt_vector_for_cost cost_vec
;
3813 cost_vec
.create (2);
3814 if (is_a
<bb_vec_info
> (vinfo
))
3815 vect_location
= instance
->location ();
3816 if (!vect_slp_analyze_node_operations (vinfo
,
3817 SLP_INSTANCE_TREE (instance
),
3818 instance
, visited
, visited_vec
,
3820 /* Instances with a root stmt require vectorized defs for the
3822 || (SLP_INSTANCE_ROOT_STMT (instance
)
3823 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
3824 != vect_internal_def
)))
3826 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3827 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3828 if (dump_enabled_p ())
3829 dump_printf_loc (MSG_NOTE
, vect_location
,
3830 "removing SLP instance operations starting from: %G",
3832 vect_free_slp_instance (instance
);
3833 vinfo
->slp_instances
.ordered_remove (i
);
3834 cost_vec
.release ();
/* Un-share: nodes visited only by the failed instance must be analyzed
   again for other instances.  */
3835 while (!visited_vec
.is_empty ())
3836 visited
.remove (visited_vec
.pop ());
3842 /* For BB vectorization remember the SLP graph entry
3844 if (is_a
<bb_vec_info
> (vinfo
))
3845 instance
->cost_vec
= cost_vec
;
3848 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
3849 cost_vec
.release ();
3854 /* Compute vectorizable live stmts. */
3855 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3857 hash_set
<stmt_vec_info
> svisited
;
3858 hash_set
<slp_tree
> visited
;
3859 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3861 vect_location
= instance
->location ();
3862 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
3863 instance
, &instance
->cost_vec
, svisited
,
/* Success iff at least one SLP instance survived analysis.  */
3868 return !vinfo
->slp_instances
.is_empty ();
3871 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3872 closing the eventual chain. */
3875 get_ultimate_leader (slp_instance instance
,
3876 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
/* Union-find style lookup: follow leader links until an instance maps to
   itself, remembering the visited map slots ...  */
3878 auto_vec
<slp_instance
*, 8> chain
;
3880 while (*(tem
= instance_leader
.get (instance
)) != instance
)
3882 chain
.safe_push (tem
);
/* ... then path-compress by rewriting every visited slot to point
   directly at the ultimate leader.  */
3885 while (!chain
.is_empty ())
3886 *chain
.pop () = instance
;
3890 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3893 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
3894 slp_instance instance
, slp_tree node
,
3895 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
3896 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
3897 hash_set
<slp_tree
> &visited
)
3899 stmt_vec_info stmt_info
;
/* Tag every scalar stmt of NODE with the current instance; a stmt already
   tagged by another instance links the two subgraphs (below).  */
3902 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3905 slp_instance
&stmt_instance
3906 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
3909 else if (stmt_instance
!= instance
)
3911 /* If we're running into a previously marked stmt make us the
3912 leader of the current ultimate leader. This keeps the
3913 leader chain acyclic and works even when the current instance
3914 connects two previously independent graph parts. */
3915 slp_instance stmt_leader
3916 = get_ultimate_leader (stmt_instance
, instance_leader
);
3917 if (stmt_leader
!= instance
)
3918 instance_leader
.put (stmt_leader
, instance
);
3920 stmt_instance
= instance
;
3923 if (visited
.add (node
))
3927 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3928 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3929 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
3930 instance_leader
, visited
);
3933 /* Partition the SLP graph into pieces that can be costed independently. */
3936 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
3938 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3940 /* First walk the SLP graph assigning each involved scalar stmt a
3941 corresponding SLP graph entry and upon visiting a previously
3942 marked stmt, make the stmts leader the current SLP graph entry. */
3943 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
3944 hash_map
<slp_instance
, slp_instance
> instance_leader
;
3945 hash_set
<slp_tree
> visited
;
3946 slp_instance instance
;
3947 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
/* Each instance initially leads itself (union-find singleton).  */
3949 instance_leader
.put (instance
, instance
);
3950 vect_bb_partition_graph_r (bb_vinfo
,
3951 instance
, SLP_INSTANCE_TREE (instance
),
3952 stmt_to_instance
, instance_leader
,
3956 /* Then collect entries to each independent subgraph. */
3957 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3959 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
3960 leader
->subgraph_entries
.safe_push (instance
);
3961 if (dump_enabled_p ()
3962 && leader
!= instance
)
3963 dump_printf_loc (MSG_NOTE
, vect_location
,
3964 "instance %p is leader of %p\n",
3969 /* Compute the scalar cost of the SLP node NODE and its children
3970 and return it. Do not account defs that are marked in LIFE and
3971 update LIFE according to uses of NODE. */
3974 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
3975 slp_tree node
, vec
<bool, va_heap
> *life
,
3976 stmt_vector_for_cost
*cost_vec
,
3977 hash_set
<slp_tree
> &visited
)
3980 stmt_vec_info stmt_info
;
3983 if (visited
.add (node
))
3986 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3988 ssa_op_iter op_iter
;
3989 def_operand_p def_p
;
3994 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3995 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3997 /* If there is a non-vectorized use of the defs then the scalar
3998 stmt is kept live in which case we do not account it or any
3999 required defs in the SLP children in the scalar cost. This
4000 way we make the vectorization more costly when compared to
4002 if (!STMT_VINFO_LIVE_P (stmt_info
))
4004 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4006 imm_use_iterator use_iter
;
4008 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4009 if (!is_gimple_debug (use_stmt
))
4011 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
4014 (vect_stmt_to_vectorize (use_stmt_info
)))
4017 BREAK_FROM_IMM_USE_STMT (use_iter
);
4025 /* Count scalar stmts only once. */
4026 if (gimple_visited_p (orig_stmt
))
4028 gimple_set_visited (orig_stmt
, true);
/* Classify the stmt for the scalar cost model: load/store for data refs,
   otherwise a regular (or no-op conversion) scalar stmt.  */
4030 vect_cost_for_stmt kind
;
4031 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
4033 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
4036 kind
= scalar_store
;
4038 else if (vect_nop_conversion_p (orig_stmt_info
))
4042 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
4043 SLP_TREE_VECTYPE (node
), 0, vect_body
);
4046 auto_vec
<bool, 20> subtree_life
;
4047 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4049 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4051 /* Do not directly pass LIFE to the recursive call, copy it to
4052 confine changes in the callee to the current child/subtree. */
4053 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
/* A permute node remaps lanes: translate LIFE through the lane
   permutation into the child's lane numbering.  */
4055 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
4056 for (unsigned j
= 0;
4057 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
4059 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
4060 if (perm
.first
== i
)
4061 subtree_life
[perm
.second
] = (*life
)[j
];
4066 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
4067 subtree_life
.safe_splice (*life
);
4069 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
4071 subtree_life
.truncate (0);
4076 /* Check if vectorization of the basic block is profitable for the
4077 subgraph denoted by SLP_INSTANCES. */
4080 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
4081 vec
<slp_instance
> slp_instances
)
4083 slp_instance instance
;
4085 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
4086 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
4088 void *vect_target_cost_data
= init_cost (NULL
);
4090 /* Calculate scalar cost and sum the cost for the vector stmts
4091 previously collected. */
4092 stmt_vector_for_cost scalar_costs
;
4093 scalar_costs
.create (0);
4094 hash_set
<slp_tree
> visited
;
4095 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4097 auto_vec
<bool, 20> life
;
4098 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4100 vect_bb_slp_scalar_cost (bb_vinfo
,
4101 SLP_INSTANCE_TREE (instance
),
4102 &life
, &scalar_costs
, visited
);
/* Feed the per-instance vector costs (recorded during analysis) into the
   target cost model, then drop the per-instance vector.  */
4103 add_stmt_costs (bb_vinfo
, vect_target_cost_data
, &instance
->cost_vec
);
4104 instance
->cost_vec
.release ();
4106 /* Unset visited flag. */
4107 stmt_info_for_cost
*si
;
4108 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
4109 gimple_set_visited (si
->stmt_info
->stmt
, false);
4111 void *scalar_target_cost_data
= init_cost (NULL
);
4112 add_stmt_costs (bb_vinfo
, scalar_target_cost_data
, &scalar_costs
);
4113 scalar_costs
.release ();
4115 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4116 destroy_cost_data (scalar_target_cost_data
);
4118 /* Complete the target-specific vector cost calculation. */
4119 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4120 &vec_inside_cost
, &vec_epilogue_cost
);
4121 destroy_cost_data (vect_target_cost_data
);
4123 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4125 if (dump_enabled_p ())
4127 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4128 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
4130 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
4131 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
4132 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
4135 /* Vectorization is profitable if its cost is more than the cost of scalar
4136 version. Note that we err on the vector side for equal cost because
4137 the cost estimate is otherwise quite pessimistic (constant uses are
4138 free on the scalar side but cost a load on the vector side for
4140 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4146 /* qsort comparator for lane defs. */
/* Compares two (lane-number, def) pairs by lane number so lane_defs can be
   sorted into lane order (see vect_slp_check_for_constructors).  */
4149 vld_cmp (const void *a_
, const void *b_
)
4151 auto *a
= (const std::pair
<unsigned, tree
> *)a_
;
4152 auto *b
= (const std::pair
<unsigned, tree
> *)b_
;
4153 return a
->first
- b
->first
;
4156 /* Return true if USE_STMT is a vector lane insert into VEC and set
4157 *THIS_LANE to the lane number that is set. */
4160 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
4162 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
/* Require a BIT_INSERT_EXPR into VEC (or, when VEC is NULL, discover the
   vector from rhs1), with matching element type and a lane position that
   is a constant multiple of the element size.  */
4164 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
4166 ? gimple_assign_rhs1 (use_ass
) != vec
4167 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
4168 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
4169 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
4170 || !constant_multiple_p
4171 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
4172 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
4178 /* Find any vectorizable constructors and add them to the grouped_store
4182 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4184 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4185 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4186 !gsi_end_p (gsi
); gsi_next (&gsi
))
4188 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4192 tree rhs
= gimple_assign_rhs1 (assign
);
/* Case 1: a vector CONSTRUCTOR with one scalar element per lane, all
   elements SSA names defined inside the region, and not uniform.  */
4193 if (gimple_assign_rhs_code (assign
) == CONSTRUCTOR
)
4195 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4196 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4197 CONSTRUCTOR_NELTS (rhs
))
4198 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4199 || uniform_vector_p (rhs
))
4204 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
4205 if (TREE_CODE (val
) != SSA_NAME
4206 || !bb_vinfo
->lookup_def (val
))
4208 if (j
!= CONSTRUCTOR_NELTS (rhs
))
4211 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4212 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
/* Case 2: a chain of BIT_INSERT_EXPRs that together fill every lane of a
   fixed-length vector, starting from an insert into lane zero.  */
4214 else if (gimple_assign_rhs_code (assign
) == BIT_INSERT_EXPR
4215 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
4216 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
4217 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
4218 && integer_zerop (gimple_assign_rhs3 (assign
))
4219 && useless_type_conversion_p
4220 (TREE_TYPE (TREE_TYPE (rhs
)),
4221 TREE_TYPE (gimple_assign_rhs2 (assign
)))
4222 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
4224 /* We start to match on insert to lane zero but since the
4225 inserts need not be ordered we'd have to search both
4226 the def and the use chains. */
4227 tree vectype
= TREE_TYPE (rhs
);
4228 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
4229 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
4230 auto_sbitmap
lanes (nlanes
);
4231 bitmap_clear (lanes
);
4232 bitmap_set_bit (lanes
, 0);
4233 tree def
= gimple_assign_lhs (assign
);
4234 lane_defs
.quick_push
4235 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
4236 unsigned lanes_found
= 1;
4237 /* Start with the use chains, the last stmt will be the root. */
4238 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
4241 use_operand_p use_p
;
/* Follow single immediate uses: each must be another in-region lane
   insert into a lane not yet seen.  */
4243 if (!single_imm_use (def
, &use_p
, &use_stmt
))
4246 if (!bb_vinfo
->lookup_stmt (use_stmt
)
4247 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
4248 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
4250 if (bitmap_bit_p (lanes
, this_lane
))
4253 bitmap_set_bit (lanes
, this_lane
);
4254 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
4255 lane_defs
.quick_push (std::make_pair
4256 (this_lane
, gimple_assign_rhs2 (use_ass
)));
4257 last
= bb_vinfo
->lookup_stmt (use_ass
);
4258 def
= gimple_assign_lhs (use_ass
);
4260 while (lanes_found
< nlanes
);
4261 if (lanes_found
< nlanes
)
4263 /* Now search the def chain. */
4264 def
= gimple_assign_rhs1 (assign
);
4267 if (TREE_CODE (def
) != SSA_NAME
4268 || !has_single_use (def
))
4270 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
4272 if (!bb_vinfo
->lookup_stmt (def_stmt
)
4273 || !vect_slp_is_lane_insert (def_stmt
,
4274 NULL_TREE
, &this_lane
)
4275 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
4277 if (bitmap_bit_p (lanes
, this_lane
))
4280 bitmap_set_bit (lanes
, this_lane
);
4281 lane_defs
.quick_push (std::make_pair
4283 gimple_assign_rhs2 (def_stmt
)));
4284 def
= gimple_assign_rhs1 (def_stmt
);
4286 while (lanes_found
< nlanes
);
/* Only a fully populated vector (every lane inserted exactly once)
   is recorded as a constructor-style SLP root.  */
4288 if (lanes_found
== nlanes
)
4290 /* Sort lane_defs after the lane index and register the root. */
4291 lane_defs
.qsort (vld_cmp
);
4292 vec
<stmt_vec_info
> stmts
;
4293 stmts
.create (nlanes
);
4294 for (unsigned i
= 0; i
< nlanes
; ++i
)
4295 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
4296 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
4303 /* Walk the grouped store chains and replace entries with their
4304 pattern variant if any. */
4307 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
4309 stmt_vec_info first_element
;
4312 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
4314 /* We also have CTORs in this array. */
4315 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
/* If the group head was replaced by a pattern stmt, transplant the
   DR_GROUP_* bookkeeping from the original onto the pattern stmt and
   make the pattern stmt the recorded group head.  */
4317 if (STMT_VINFO_IN_PATTERN_P (first_element
))
4319 stmt_vec_info orig
= first_element
;
4320 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
4321 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
4322 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
4323 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
4324 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
4325 vinfo
->grouped_stores
[i
] = first_element
;
/* Do the same replacement for every later element of the chain.  */
4327 stmt_vec_info prev
= first_element
;
4328 while (DR_GROUP_NEXT_ELEMENT (prev
))
4330 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
4331 if (STMT_VINFO_IN_PATTERN_P (elt
))
4333 stmt_vec_info orig
= elt
;
4334 elt
= STMT_VINFO_RELATED_STMT (elt
);
4335 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
4336 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
4337 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
4339 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
4345 /* Check if the region described by BB_VINFO can be vectorized, returning
4346 true if so. When returning false, set FATAL to true if the same failure
4347 would prevent vectorization at other vector sizes, false if it is still
4348 worth trying other sizes. N_STMTS is the number of statements in the
4352 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4353 vec
<int> *dataref_groups
)
4355 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4357 slp_instance instance
;
4359 poly_uint64 min_vf
= 2;
4361 /* The first group of checks is independent of the vector size. */
4364 /* Analyze the data references. */
4366 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4368 if (dump_enabled_p ())
4369 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4370 "not vectorized: unhandled data-ref in basic "
4375 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4377 if (dump_enabled_p ())
4378 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4379 "not vectorized: unhandled data access in "
4384 vect_slp_check_for_constructors (bb_vinfo
);
4386 /* If there are no grouped stores and no constructors in the region
4387 there is no need to continue with pattern recog as vect_analyze_slp
4388 will fail anyway. */
4389 if (bb_vinfo
->grouped_stores
.is_empty ()
4390 && bb_vinfo
->roots
.is_empty ())
4392 if (dump_enabled_p ())
4393 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4394 "not vectorized: no grouped stores in "
4399 /* While the rest of the analysis below depends on it in some way. */
4402 vect_pattern_recog (bb_vinfo
);
4404 /* Update store groups from pattern processing. */
4405 vect_fixup_store_groups_with_patterns (bb_vinfo
);
4407 /* Check the SLP opportunities in the basic block, analyze and build SLP
4409 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4411 if (dump_enabled_p ())
4413 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4414 "Failed to SLP the basic block.\n");
4415 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4416 "not vectorized: failed to find SLP opportunities "
4417 "in basic block.\n");
4422 /* Optimize permutations. */
4423 vect_optimize_slp (bb_vinfo
);
4425 /* Gather the loads reachable from the SLP graph entries. */
4426 vect_gather_slp_loads (bb_vinfo
);
4428 vect_record_base_alignments (bb_vinfo
);
4430 /* Analyze and verify the alignment of data references and the
4431 dependence in the SLP instances. */
/* No ++i: failing instances are ordered_remove'd below, shifting the next
   instance into slot I.  */
4432 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4434 vect_location
= instance
->location ();
4435 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4436 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4438 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4439 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4440 if (dump_enabled_p ())
4441 dump_printf_loc (MSG_NOTE
, vect_location
,
4442 "removing SLP instance operations starting from: %G",
4444 vect_free_slp_instance (instance
);
4445 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4449 /* Mark all the statements that we want to vectorize as pure SLP and
4451 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4452 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4453 if (stmt_vec_info root
= SLP_INSTANCE_ROOT_STMT (instance
))
4455 STMT_SLP_TYPE (root
) = pure_slp
;
/* For a BIT_INSERT_EXPR root, walk back through the insert chain and
   mark each insert stmt pure-SLP as well.  */
4456 if (is_gimple_assign (root
->stmt
)
4457 && gimple_assign_rhs_code (root
->stmt
) == BIT_INSERT_EXPR
)
4459 /* ??? We should probably record the whole vector of
4460 root stmts so we do not have to back-track here... */
4461 for (unsigned n
= SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
));
4464 root
= bb_vinfo
->lookup_def (gimple_assign_rhs1 (root
->stmt
));
4465 STMT_SLP_TYPE (root
) = pure_slp
;
4472 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4475 if (!vect_slp_analyze_operations (bb_vinfo
))
4477 if (dump_enabled_p ())
4478 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4479 "not vectorized: bad operation in basic block.\n");
4483 vect_bb_partition_graph (bb_vinfo
);
4488 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4489 basic blocks in BBS, returning true on success.
4490 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4493 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4494 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4496 bb_vec_info bb_vinfo
;
4497 auto_vector_modes vector_modes
;
4499 /* Autodetect first vector size we try. */
4500 machine_mode next_vector_mode
= VOIDmode
;
4501 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4502 unsigned int mode_i
= 0;
4504 vec_info_shared shared
;
4506 machine_mode autodetected_vector_mode
= VOIDmode
;
4509 bool vectorized
= false;
4511 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4513 bool first_time_p
= shared
.datarefs
.is_empty ();
4514 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4516 bb_vinfo
->shared
->save_datarefs ();
4518 bb_vinfo
->shared
->check_datarefs ();
4519 bb_vinfo
->vector_mode
= next_vector_mode
;
4521 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
)
4522 && dbg_cnt (vect_slp
))
4524 if (dump_enabled_p ())
4526 dump_printf_loc (MSG_NOTE
, vect_location
,
4527 "***** Analysis succeeded with vector mode"
4528 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4529 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4532 bb_vinfo
->shared
->check_datarefs ();
4535 slp_instance instance
;
4536 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4538 if (instance
->subgraph_entries
.is_empty ())
4541 vect_location
= instance
->location ();
4542 if (!unlimited_cost_model (NULL
)
4543 && !vect_bb_vectorization_profitable_p
4544 (bb_vinfo
, instance
->subgraph_entries
))
4546 if (dump_enabled_p ())
4547 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4548 "not vectorized: vectorization is not "
4553 if (!vectorized
&& dump_enabled_p ())
4554 dump_printf_loc (MSG_NOTE
, vect_location
,
4555 "Basic block will be vectorized "
4559 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4561 unsigned HOST_WIDE_INT bytes
;
4562 if (dump_enabled_p ())
4565 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4566 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4567 "basic block part vectorized using %wu "
4568 "byte vectors\n", bytes
);
4570 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4571 "basic block part vectorized using "
4572 "variable length vectors\n");
4578 if (dump_enabled_p ())
4579 dump_printf_loc (MSG_NOTE
, vect_location
,
4580 "***** Analysis failed with vector mode %s\n",
4581 GET_MODE_NAME (bb_vinfo
->vector_mode
));
4585 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
4588 while (mode_i
< vector_modes
.length ()
4589 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
4591 if (dump_enabled_p ())
4592 dump_printf_loc (MSG_NOTE
, vect_location
,
4593 "***** The result for vector mode %s would"
4595 GET_MODE_NAME (vector_modes
[mode_i
]));
4601 if (mode_i
< vector_modes
.length ()
4602 && VECTOR_MODE_P (autodetected_vector_mode
)
4603 && (related_vector_mode (vector_modes
[mode_i
],
4604 GET_MODE_INNER (autodetected_vector_mode
))
4605 == autodetected_vector_mode
)
4606 && (related_vector_mode (autodetected_vector_mode
,
4607 GET_MODE_INNER (vector_modes
[mode_i
]))
4608 == vector_modes
[mode_i
]))
4610 if (dump_enabled_p ())
4611 dump_printf_loc (MSG_NOTE
, vect_location
,
4612 "***** Skipping vector mode %s, which would"
4613 " repeat the analysis for %s\n",
4614 GET_MODE_NAME (vector_modes
[mode_i
]),
4615 GET_MODE_NAME (autodetected_vector_mode
));
4620 || mode_i
== vector_modes
.length ()
4621 || autodetected_vector_mode
== VOIDmode
4622 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4623 vector sizes will fail do not bother iterating. */
4627 /* Try the next biggest vector size. */
4628 next_vector_mode
= vector_modes
[mode_i
++];
4629 if (dump_enabled_p ())
4630 dump_printf_loc (MSG_NOTE
, vect_location
,
4631 "***** Re-trying analysis with vector mode %s\n",
4632 GET_MODE_NAME (next_vector_mode
));
4637 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4638 true if anything in the basic-block was vectorized. */
4641 vect_slp_bbs (vec
<basic_block
> bbs
)
4643 vec
<data_reference_p
> datarefs
= vNULL
;
4644 auto_vec
<int> dataref_groups
;
4646 int current_group
= 0;
4648 for (unsigned i
= 0; i
< bbs
.length (); i
++)
4650 basic_block bb
= bbs
[i
];
4651 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
4654 gimple
*stmt
= gsi_stmt (gsi
);
4655 if (is_gimple_debug (stmt
))
4660 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
4661 vect_location
= stmt
;
4663 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
4664 &dataref_groups
, current_group
))
4669 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
4672 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4673 true if anything in the basic-block was vectorized. */
4676 vect_slp_bb (basic_block bb
)
4678 auto_vec
<basic_block
> bbs
;
4680 return vect_slp_bbs (bbs
);
4683 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4684 true if anything in the basic-block was vectorized. */
4687 vect_slp_function (function
*fun
)
4690 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
4691 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
4693 /* For the moment split the function into pieces to avoid making
4694 the iteration on the vector mode moot. Split at points we know
4695 to not handle well which is CFG merges (SLP discovery doesn't
4696 handle non-loop-header PHIs) and loop exits. Since pattern
4697 recog requires reverse iteration to visit uses before defs
4698 simply chop RPO into pieces. */
4699 auto_vec
<basic_block
> bbs
;
4700 for (unsigned i
= 0; i
< n
; i
++)
4702 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
4705 /* Split when a BB is not dominated by the first block. */
4706 if (!bbs
.is_empty ()
4707 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
4709 if (dump_enabled_p ())
4710 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4711 "splitting region at dominance boundary bb%d\n",
4715 /* Split when the loop determined by the first block
4716 is exited. This is because we eventually insert
4717 invariants at region begin. */
4718 else if (!bbs
.is_empty ()
4719 && bbs
[0]->loop_father
!= bb
->loop_father
4720 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
4722 if (dump_enabled_p ())
4723 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4724 "splitting region at loop %d exit at bb%d\n",
4725 bbs
[0]->loop_father
->num
, bb
->index
);
4729 if (split
&& !bbs
.is_empty ())
4731 r
|= vect_slp_bbs (bbs
);
4733 bbs
.quick_push (bb
);
4738 /* When we have a stmt ending this block and defining a
4739 value we have to insert on edges when inserting after it for
4740 a vector containing its definition. Avoid this for now. */
4741 if (gimple
*last
= last_stmt (bb
))
4742 if (gimple_get_lhs (last
)
4743 && is_ctrl_altering_stmt (last
))
4745 if (dump_enabled_p ())
4746 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4747 "splitting region at control altering "
4748 "definition %G", last
);
4749 r
|= vect_slp_bbs (bbs
);
4754 if (!bbs
.is_empty ())
4755 r
|= vect_slp_bbs (bbs
);
4762 /* Build a variable-length vector in which the elements in ELTS are repeated
4763 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4764 RESULTS and add any new instructions to SEQ.
4766 The approach we use is:
4768 (1) Find a vector mode VM with integer elements of mode IM.
4770 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4771 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4772 from small vectors to IM.
4774 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4776 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4777 correct byte contents.
4779 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4781 We try to find the largest IM for which this sequence works, in order
4782 to cut down on the number of interleaves. */
4785 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
4786 vec
<tree
> elts
, unsigned int nresults
,
4789 unsigned int nelts
= elts
.length ();
4790 tree element_type
= TREE_TYPE (vector_type
);
4792 /* (1) Find a vector mode VM with integer elements of mode IM. */
4793 unsigned int nvectors
= 1;
4794 tree new_vector_type
;
4796 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
4797 &nvectors
, &new_vector_type
,
4801 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4802 unsigned int partial_nelts
= nelts
/ nvectors
;
4803 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
4805 tree_vector_builder partial_elts
;
4806 auto_vec
<tree
, 32> pieces (nvectors
* 2);
4807 pieces
.quick_grow (nvectors
* 2);
4808 for (unsigned int i
= 0; i
< nvectors
; ++i
)
4810 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4811 ELTS' has mode IM. */
4812 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
4813 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
4814 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
4815 tree t
= gimple_build_vector (seq
, &partial_elts
);
4816 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
4817 TREE_TYPE (new_vector_type
), t
);
4819 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4820 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
4823 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4824 correct byte contents.
4826 We need to repeat the following operation log2(nvectors) times:
4828 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4829 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4831 However, if each input repeats every N elements and the VF is
4832 a multiple of N * 2, the HI result is the same as the LO. */
4833 unsigned int in_start
= 0;
4834 unsigned int out_start
= nvectors
;
4835 unsigned int hi_start
= nvectors
/ 2;
4836 /* A bound on the number of outputs needed to produce NRESULTS results
4837 in the final iteration. */
4838 unsigned int noutputs_bound
= nvectors
* nresults
;
4839 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
4841 noutputs_bound
/= 2;
4842 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
4843 for (unsigned int i
= 0; i
< limit
; ++i
)
4846 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
4849 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
4853 tree output
= make_ssa_name (new_vector_type
);
4854 tree input1
= pieces
[in_start
+ (i
/ 2)];
4855 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
4856 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
4859 gimple_seq_add_stmt (seq
, stmt
);
4860 pieces
[out_start
+ i
] = output
;
4862 std::swap (in_start
, out_start
);
4865 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4866 results
.reserve (nresults
);
4867 for (unsigned int i
= 0; i
< nresults
; ++i
)
4869 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
4870 pieces
[in_start
+ i
]));
4872 results
.quick_push (results
[i
- nvectors
]);
4876 /* For constant and loop invariant defs in OP_NODE this function creates
4877 vector defs that will be used in the vectorized stmts and stores them
4878 to SLP_TREE_VEC_DEFS of OP_NODE. */
4881 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
4883 unsigned HOST_WIDE_INT nunits
;
4885 unsigned j
, number_of_places_left_in_vector
;
4888 int group_size
= op_node
->ops
.length ();
4889 unsigned int vec_num
, i
;
4890 unsigned number_of_copies
= 1;
4892 gimple_seq ctor_seq
= NULL
;
4893 auto_vec
<tree
, 16> permute_results
;
4895 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4896 vector_type
= SLP_TREE_VECTYPE (op_node
);
4898 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
4899 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
4900 auto_vec
<tree
> voprnds (number_of_vectors
);
4902 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4903 created vectors. It is greater than 1 if unrolling is performed.
4905 For example, we have two scalar operands, s1 and s2 (e.g., group of
4906 strided accesses of size two), while NUNITS is four (i.e., four scalars
4907 of this type can be packed in a vector). The output vector will contain
4908 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4911 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4912 containing the operands.
4914 For example, NUNITS is four as before, and the group size is 8
4915 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4916 {s5, s6, s7, s8}. */
4918 /* When using duplicate_and_interleave, we just need one element for
4919 each scalar statement. */
4920 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
4921 nunits
= group_size
;
4923 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
4925 number_of_places_left_in_vector
= nunits
;
4927 tree_vector_builder
elts (vector_type
, nunits
, 1);
4928 elts
.quick_grow (nunits
);
4929 stmt_vec_info insert_after
= NULL
;
4930 for (j
= 0; j
< number_of_copies
; j
++)
4933 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
4935 /* Create 'vect_ = {op0,op1,...,opn}'. */
4936 number_of_places_left_in_vector
--;
4938 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
4940 if (CONSTANT_CLASS_P (op
))
4942 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4944 /* Can't use VIEW_CONVERT_EXPR for booleans because
4945 of possibly different sizes of scalar value and
4947 if (integer_zerop (op
))
4948 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
4949 else if (integer_onep (op
))
4950 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
4955 op
= fold_unary (VIEW_CONVERT_EXPR
,
4956 TREE_TYPE (vector_type
), op
);
4957 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
4961 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
4963 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
4966 = build_all_ones_cst (TREE_TYPE (vector_type
));
4968 = build_zero_cst (TREE_TYPE (vector_type
));
4969 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
4970 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
4976 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
4979 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
4982 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
4986 elts
[number_of_places_left_in_vector
] = op
;
4987 if (!CONSTANT_CLASS_P (op
))
4989 /* For BB vectorization we have to compute an insert location
4990 when a def is inside the analyzed region since we cannot
4991 simply insert at the BB start in this case. */
4992 stmt_vec_info opdef
;
4993 if (TREE_CODE (orig_op
) == SSA_NAME
4994 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
4995 && is_a
<bb_vec_info
> (vinfo
)
4996 && (opdef
= vinfo
->lookup_def (orig_op
)))
4999 insert_after
= opdef
;
5001 insert_after
= get_later_stmt (insert_after
, opdef
);
5004 if (number_of_places_left_in_vector
== 0)
5007 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5008 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5009 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
5012 if (permute_results
.is_empty ())
5013 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
5014 elts
, number_of_vectors
,
5016 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
5018 if (!gimple_seq_empty_p (ctor_seq
))
5022 gimple_stmt_iterator gsi
;
5023 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
5025 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
5026 gsi_insert_seq_before (&gsi
, ctor_seq
,
5027 GSI_CONTINUE_LINKING
);
5029 else if (!stmt_ends_bb_p (insert_after
->stmt
))
5031 gsi
= gsi_for_stmt (insert_after
->stmt
);
5032 gsi_insert_seq_after (&gsi
, ctor_seq
,
5033 GSI_CONTINUE_LINKING
);
5037 /* When we want to insert after a def where the
5038 defining stmt throws then insert on the fallthru
5040 edge e
= find_fallthru_edge
5041 (gimple_bb (insert_after
->stmt
)->succs
);
5043 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
5044 gcc_assert (!new_bb
);
5048 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
5051 voprnds
.quick_push (vec_cst
);
5052 insert_after
= NULL
;
5053 number_of_places_left_in_vector
= nunits
;
5055 elts
.new_vector (vector_type
, nunits
, 1);
5056 elts
.quick_grow (nunits
);
5061 /* Since the vectors are created in the reverse order, we should invert
5063 vec_num
= voprnds
.length ();
5064 for (j
= vec_num
; j
!= 0; j
--)
5066 vop
= voprnds
[j
- 1];
5067 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5070 /* In case that VF is greater than the unrolling factor needed for the SLP
5071 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5072 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5073 to replicate the vectors. */
5074 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
5075 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
5077 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5080 /* Get the Ith vectorized definition from SLP_NODE. */
5083 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
5085 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
5086 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
5088 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
5091 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5094 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
5096 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
5097 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
5100 gimple
*vec_def_stmt
;
5101 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
5102 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
5105 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
5108 /* Get N vectorized definitions for SLP_NODE. */
5111 vect_get_slp_defs (vec_info
*,
5112 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
5115 n
= SLP_TREE_CHILDREN (slp_node
).length ();
5117 for (unsigned i
= 0; i
< n
; ++i
)
5119 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
5120 vec
<tree
> vec_defs
= vNULL
;
5121 vect_get_slp_defs (child
, &vec_defs
);
5122 vec_oprnds
->quick_push (vec_defs
);
5126 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5127 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5128 permute statements for the SLP node NODE. Store the number of vector
5129 permute instructions in *N_PERMS and the number of vector load
5130 instructions in *N_LOADS. */
5133 vect_transform_slp_perm_load (vec_info
*vinfo
,
5134 slp_tree node
, vec
<tree
> dr_chain
,
5135 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
5136 bool analyze_only
, unsigned *n_perms
,
5137 unsigned int *n_loads
)
5139 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
5141 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5142 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
5143 unsigned int mask_element
;
5146 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
5149 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5151 mode
= TYPE_MODE (vectype
);
5152 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5154 /* Initialize the vect stmts of NODE to properly insert the generated
5157 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
5158 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
5159 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
5161 /* Generate permutation masks for every NODE. Number of masks for each NODE
5162 is equal to GROUP_SIZE.
5163 E.g., we have a group of three nodes with three loads from the same
5164 location in each node, and the vector size is 4. I.e., we have a
5165 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5166 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5167 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5170 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5171 The last mask is illegal since we assume two operands for permute
5172 operation, and the mask element values can't be outside that range.
5173 Hence, the last mask must be converted into {2,5,5,5}.
5174 For the first two permutations we need the first and the second input
5175 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5176 we need the second and the third vectors: {b1,c1,a2,b2} and
5179 int vect_stmts_counter
= 0;
5180 unsigned int index
= 0;
5181 int first_vec_index
= -1;
5182 int second_vec_index
= -1;
5186 vec_perm_builder mask
;
5187 unsigned int nelts_to_build
;
5188 unsigned int nvectors_per_build
;
5189 unsigned int in_nlanes
;
5190 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
5191 && multiple_p (nunits
, group_size
));
5194 /* A single vector contains a whole number of copies of the node, so:
5195 (a) all permutes can use the same mask; and
5196 (b) the permutes only need a single vector input. */
5197 mask
.new_vector (nunits
, group_size
, 3);
5198 nelts_to_build
= mask
.encoded_nelts ();
5199 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
5200 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5204 /* We need to construct a separate mask for each vector statement. */
5205 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
5206 if (!nunits
.is_constant (&const_nunits
)
5207 || !vf
.is_constant (&const_vf
))
5209 mask
.new_vector (const_nunits
, const_nunits
, 1);
5210 nelts_to_build
= const_vf
* group_size
;
5211 nvectors_per_build
= 1;
5212 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
5214 auto_sbitmap
used_in_lanes (in_nlanes
);
5215 bitmap_clear (used_in_lanes
);
5217 unsigned int count
= mask
.encoded_nelts ();
5218 mask
.quick_grow (count
);
5219 vec_perm_indices indices
;
5221 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5223 unsigned int iter_num
= j
/ group_size
;
5224 unsigned int stmt_num
= j
% group_size
;
5225 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5226 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5227 bitmap_set_bit (used_in_lanes
, i
);
5230 first_vec_index
= 0;
5235 /* Enforced before the loop when !repeating_p. */
5236 unsigned int const_nunits
= nunits
.to_constant ();
5237 vec_index
= i
/ const_nunits
;
5238 mask_element
= i
% const_nunits
;
5239 if (vec_index
== first_vec_index
5240 || first_vec_index
== -1)
5242 first_vec_index
= vec_index
;
5244 else if (vec_index
== second_vec_index
5245 || second_vec_index
== -1)
5247 second_vec_index
= vec_index
;
5248 mask_element
+= const_nunits
;
5252 if (dump_enabled_p ())
5253 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5254 "permutation requires at "
5255 "least three vectors %G",
5257 gcc_assert (analyze_only
);
5261 gcc_assert (mask_element
< 2 * const_nunits
);
5264 if (mask_element
!= index
)
5266 mask
[index
++] = mask_element
;
5268 if (index
== count
&& !noop_p
)
5270 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5271 if (!can_vec_perm_const_p (mode
, indices
))
5273 if (dump_enabled_p ())
5275 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5277 "unsupported vect permute { ");
5278 for (i
= 0; i
< count
; ++i
)
5280 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5281 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5283 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5285 gcc_assert (analyze_only
);
5296 tree mask_vec
= NULL_TREE
;
5299 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5301 if (second_vec_index
== -1)
5302 second_vec_index
= first_vec_index
;
5304 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5306 /* Generate the permute statement if necessary. */
5307 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5308 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5312 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5314 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5316 perm_dest
= make_ssa_name (perm_dest
);
5318 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5319 first_vec
, second_vec
,
5321 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5325 /* If mask was NULL_TREE generate the requested
5326 identity transform. */
5327 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
5329 /* Store the vector statement in NODE. */
5330 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5335 first_vec_index
= -1;
5336 second_vec_index
= -1;
5344 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5347 /* Enforced above when !repeating_p. */
5348 unsigned int const_nunits
= nunits
.to_constant ();
5350 bool load_seen
= false;
5351 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5353 if (i
% const_nunits
== 0)
5359 if (bitmap_bit_p (used_in_lanes
, i
))
5371 /* Vectorize the SLP permutations in NODE as specified
5372 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5373 child number and lane number.
5374 Interleaving of two two-lane two-child SLP subtrees (not supported):
5375 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5376 A blend of two four-lane two-child SLP subtrees:
5377 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5378 Highpart of a four-lane one-child SLP subtree (not supported):
5379 [ { 0, 2 }, { 0, 3 } ]
5380 Where currently only a subset is supported by code generating below. */
5383 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5384 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5386 tree vectype
= SLP_TREE_VECTYPE (node
);
5388 /* ??? We currently only support all same vector input and output types
5389 while the SLP IL should really do a concat + select and thus accept
5390 arbitrary mismatches. */
5393 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5394 if (!vect_maybe_update_slp_op_vectype (child
, vectype
)
5395 || !types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5399 "Unsupported lane permutation\n");
5403 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5404 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5405 if (dump_enabled_p ())
5407 dump_printf_loc (MSG_NOTE
, vect_location
,
5408 "vectorizing permutation");
5409 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5410 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5411 dump_printf (MSG_NOTE
, "\n");
5414 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5415 if (!nunits
.is_constant ())
5417 unsigned HOST_WIDE_INT vf
= 1;
5418 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5419 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
5421 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5422 gcc_assert (multiple_p (olanes
, nunits
));
5424 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5425 from the { SLP operand, scalar lane } permutation as recorded in the
5426 SLP node as intermediate step. This part should already work
5427 with SLP children with arbitrary number of lanes. */
5428 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5429 auto_vec
<unsigned> active_lane
;
5430 vperm
.create (olanes
);
5431 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5432 for (unsigned i
= 0; i
< vf
; ++i
)
5434 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5436 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5437 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5438 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
5439 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5440 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5441 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5443 /* Advance to the next group. */
5444 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5445 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5448 if (dump_enabled_p ())
5450 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5451 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5453 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5454 dump_printf (MSG_NOTE
, ",");
5455 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5456 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
5457 vperm
[i
].first
.second
);
5459 dump_printf (MSG_NOTE
, "\n");
5462 /* We can only handle two-vector permutes, everything else should
5463 be lowered on the SLP level. The following is closely inspired
5464 by vect_transform_slp_perm_load and is supposed to eventually
5466 ??? As intermediate step do code-gen in the SLP tree representation
5468 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5469 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5470 unsigned int const_nunits
= nunits
.to_constant ();
5471 unsigned int index
= 0;
5472 unsigned int mask_element
;
5473 vec_perm_builder mask
;
5474 mask
.new_vector (const_nunits
, const_nunits
, 1);
5475 unsigned int count
= mask
.encoded_nelts ();
5476 mask
.quick_grow (count
);
5477 vec_perm_indices indices
;
5478 unsigned nperms
= 0;
5479 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5481 mask_element
= vperm
[i
].second
;
5482 if (first_vec
.first
== -1U
5483 || first_vec
== vperm
[i
].first
)
5484 first_vec
= vperm
[i
].first
;
5485 else if (second_vec
.first
== -1U
5486 || second_vec
== vperm
[i
].first
)
5488 second_vec
= vperm
[i
].first
;
5489 mask_element
+= const_nunits
;
5493 if (dump_enabled_p ())
5494 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5495 "permutation requires at "
5496 "least three vectors\n");
5501 mask
[index
++] = mask_element
;
5505 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5507 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5509 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5511 if (dump_enabled_p ())
5513 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5515 "unsupported vect permute { ");
5516 for (i
= 0; i
< count
; ++i
)
5518 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5519 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5521 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5531 if (second_vec
.first
== -1U)
5532 second_vec
= first_vec
;
5534 /* Generate the permute statement if necessary. */
5535 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5537 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5539 tree perm_dest
= make_ssa_name (vectype
);
5542 slp_tree second_node
5543 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5545 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5546 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5547 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5548 first_def
, second_def
,
5552 /* We need a copy here in case the def was external. */
5553 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5554 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5555 /* Store the vector statement in NODE. */
5556 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
5560 first_vec
= std::make_pair (-1U, -1U);
5561 second_vec
= std::make_pair (-1U, -1U);
5566 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
5571 /* Vectorize SLP NODE. */
/* NOTE(review): this extract is elided — gaps in the fused original line
   numbers (e.g. 5578-5580, 5595-5600) mean braces, returns and conditions
   are missing.  Comments below describe only the code that is visible.
   Emits the vector statements for NODE: pre-existing vector defs and
   constant/external nodes are handled up front, then an insertion
   iterator SI is chosen (loads before the first scalar load, stores
   before the last scalar stmt, other stmts after the latest child def),
   and finally the node is code-generated either via
   vectorizable_slp_permutation (VEC_PERM_EXPR nodes) or
   vect_transform_stmt.  */
5574 vect_schedule_slp_node (vec_info
*vinfo
,
5575 slp_tree node
, slp_instance instance
)
5577 gimple_stmt_iterator si
;
5581 /* For existing vectors there's nothing to do. */
5582 if (SLP_TREE_VEC_DEFS (node
).exists ())
5585 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
5587 /* Vectorize externals and constants. */
5588 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
5589 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
5591 /* ??? vectorizable_shift can end up using a scalar operand which is
5592 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5593 node in this case. */
5594 if (!SLP_TREE_VECTYPE (node
))
5597 vect_create_constant_vectors (vinfo
, node
);
/* Internal node: the representative stmt drives insertion-point choice.  */
5601 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
5603 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
5604 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
5606 if (dump_enabled_p ())
5607 dump_printf_loc (MSG_NOTE
, vect_location
,
5608 "------>vectorizing SLP node starting from: %G",
/* Loads/stores (data refs) that are not permute nodes get their iterator
   from the scalar load/store position.  */
5611 if (STMT_VINFO_DATA_REF (stmt_info
)
5612 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5614 /* Vectorized loads go before the first scalar load to make it
5615 ready early, vectorized stores go before the last scalar
5616 stmt which is where all uses are ready. */
5617 stmt_vec_info last_stmt_info
= NULL
;
5618 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
5619 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
5620 else /* DR_IS_WRITE */
5621 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
5622 si
= gsi_for_stmt (last_stmt_info
->stmt
);
5624 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
5625 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
5626 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
5627 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5629 /* For PHI node vectorization we do not use the insertion iterator. */
5634 /* Emit other stmts after the children vectorized defs which is
5635 earliest possible. */
5636 gimple
*last_stmt
= NULL
;
/* Scan all children to find the dominating-last def; SI is then placed
   after it (elided below).  */
5637 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5638 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
5640 /* For fold-left reductions we are retaining the scalar
5641 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5642 set so the representation isn't perfect. Resort to the
5643 last scalar def here. */
5644 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
5646 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
5647 == cycle_phi_info_type
);
5648 gphi
*phi
= as_a
<gphi
*>
5649 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
5651 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
5654 /* We are emitting all vectorized stmts in the same place and
5655 the last one is the last.
5656 ??? Unless we have a load permutation applied and that
5657 figures to re-use an earlier generated load. */
5660 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
5662 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5665 else if (!SLP_TREE_VECTYPE (child
))
5667 /* For externals we use unvectorized at all scalar defs. */
5670 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
5671 if (TREE_CODE (def
) == SSA_NAME
5672 && !SSA_NAME_IS_DEFAULT_DEF (def
))
5674 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
5676 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
5682 /* For externals we have to look at all defs since their
5683 insertion place is decided per vector. But beware
5684 of pre-existing vectors where we need to make sure
5685 we do not insert before the region boundary. */
5686 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
5687 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
/* Pre-existing vector with no scalar ops: anchor at the start of the
   basic-block region so we never insert before its boundary.  */
5688 last_stmt
= gsi_stmt (gsi_after_labels
5689 (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]));
5694 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
5695 if (TREE_CODE (vdef
) == SSA_NAME
5696 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
5698 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
5700 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5705 /* This can happen when all children are pre-existing vectors or
5708 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
/* PHIs have no insertion point inside the block — step past labels
   instead of using gsi_for_stmt on the PHI itself.  */
5709 if (is_a
<gphi
*> (last_stmt
))
5710 si
= gsi_after_labels (gimple_bb (last_stmt
));
5713 si
= gsi_for_stmt (last_stmt
);
5718 bool done_p
= false;
5720 /* Handle purely internal nodes. */
5721 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
5723 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
5724 be shared with different SLP nodes (but usually it's the same
5725 operation apart from the case the stmt is only there for denoting
5726 the actual scalar lane defs ...). So do not call vect_transform_stmt
5727 but open-code it here (partly). */
5728 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
/* All other internal nodes go through the generic transform.  */
5733 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
5736 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5737 For loop vectorization this is done in vectorizable_call, but for SLP
5738 it needs to be deferred until end of vect_schedule_slp, because multiple
5739 SLP instances may refer to the same scalar stmt. */
/* NOTE(review): elided extract — braces and `continue`/`return` lines are
   missing from this view.  Recursive worker: walks the SLP tree once
   (VISITED guards against revisiting shared nodes) and, for each pure-SLP
   non-pattern scalar call still in the IL, replaces it with
   `lhs = 0` of the call's lhs type.  */
5742 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
5743 slp_tree node
, hash_set
<slp_tree
> &visited
)
5746 gimple_stmt_iterator gsi
;
5750 stmt_vec_info stmt_info
;
/* Only internal defs carry scalar stmts worth rewriting.  */
5752 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
/* hash_set::add returns true if NODE was already present.  */
5755 if (visited
.add (node
))
5758 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5759 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
5761 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
/* Skip non-calls and calls already removed from a basic block.  */
5763 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
5764 if (!stmt
|| gimple_bb (stmt
) == NULL
)
5766 if (is_pattern_stmt_p (stmt_info
)
5767 || !PURE_SLP_STMT (stmt_info
))
/* Replace the call with an assignment of zero so later DCE can clean
   it up; keep the SSA def-stmt link consistent.  */
5769 lhs
= gimple_call_lhs (stmt
);
5770 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
5771 gsi
= gsi_for_stmt (stmt
);
5772 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
5773 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
/* Public entry point: set up the visited set and delegate to the
   recursive worker above.  (Return type and braces are elided in this
   extract.)  */
5778 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
5780 hash_set
<slp_tree
> visited
;
5781 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
5784 /* Vectorize the instance root. */
/* NOTE(review): elided extract — braces and the single-vector fast path's
   control flow are partly missing.  Replaces the instance's scalar root
   stmt with an assignment from the vectorized result: directly (with a
   VIEW_CONVERT_EXPR if the types differ) when there is exactly one vector
   stmt, or via a CONSTRUCTOR gathering all vector lhs's when there are
   several.  */
5787 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
5789 gassign
*rstmt
= NULL
;
5791 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
5796 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5798 tree vect_lhs
= gimple_get_lhs (child_stmt
);
5799 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
/* Insert a VIEW_CONVERT_EXPR when vector and root types differ only in
   representation.  */
5800 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
5801 TREE_TYPE (vect_lhs
)))
5802 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
5804 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
5808 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
/* Multiple vector stmts: collect their lhs's into a CONSTRUCTOR of the
   root's rhs type.  */
5810 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5813 vec
<constructor_elt
, va_gc
> *v
;
5814 vec_alloc (v
, nelts
);
5816 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5818 CONSTRUCTOR_APPEND_ELT (v
,
5820 gimple_get_lhs (child_stmt
));
5822 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5823 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
5824 tree r_constructor
= build_constructor (rtype
, v
);
5825 rstmt
= gimple_build_assign (lhs
, r_constructor
);
/* Swap the scalar root stmt for the built assignment in place.  */
5830 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
5831 gsi_replace (&rgsi
, rstmt
, true);
5841 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
/* NOTE(review): elided extract — many original lines (braces, loop bodies,
   dfs-number assignment) are missing.  The visible fields (dfs, lowlink,
   on_stack, the explicit STACK) follow a Tarjan-style SCC walk: nodes are
   scheduled bottom-up; when a node closes an SCC, members are popped and
   iteratively scheduled, breaking cycles at PHIs whose backedge arguments
   are filled in afterwards.  */
5844 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
5845 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
5846 int &maxdfs
, vec
<slp_tree
> &stack
)
/* Each node must be entered at most once.  */
5849 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
5850 gcc_assert (!existed_p
);
5852 info
->lowlink
= maxdfs
;
/* Leafs (constants/externals) cannot be part of a cycle — schedule
   immediately and do not push them on the stack.  */
5856 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5858 info
->on_stack
= false;
5859 vect_schedule_slp_node (vinfo
, node
, instance
);
5863 info
->on_stack
= true;
5864 stack
.safe_push (node
);
/* Recurse into unvisited children, updating lowlink per Tarjan.  */
5869 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5873 slp_scc_info
*child_info
= scc_info
.get (child
);
5876 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
5877 /* Recursion might have re-allocated the node. */
5878 info
= scc_info
.get (node
);
5879 child_info
= scc_info
.get (child
);
5880 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
5882 else if (child_info
->on_stack
)
5883 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
/* Not an SCC root yet — leave the node on the stack.  */
5885 if (info
->lowlink
!= info
->dfs
)
5888 auto_vec
<slp_tree
, 4> phis_to_fixup
;
/* Singleton SCC: schedule directly; remember PHIs for backedge fixup.  */
5891 if (stack
.last () == node
)
5894 info
->on_stack
= false;
5895 vect_schedule_slp_node (vinfo
, node
, instance
);
5896 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
5897 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
5898 phis_to_fixup
.quick_push (node
);
5903 int last_idx
= stack
.length () - 1;
5904 while (stack
[last_idx
] != node
)
5906 /* We can break the cycle at PHIs who have at least one child
5907 code generated. Then we could re-start the DFS walk until
5908 all nodes in the SCC are covered (we might have new entries
5909 for only back-reachable nodes). But it's simpler to just
5910 iterate and schedule those that are ready. */
5911 unsigned todo
= stack
.length () - last_idx
;
5914 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
5916 slp_tree entry
= stack
[idx
];
5919 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
5920 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
5922 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
5929 else if (scc_info
.get (child
)->on_stack
)
5947 vect_schedule_slp_node (vinfo
, entry
, instance
);
5948 scc_info
.get (entry
)->on_stack
= false;
5952 phis_to_fixup
.safe_push (entry
);
/* Pop the whole SCC off the stack.  */
5959 stack
.truncate (last_idx
);
5962 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
5964 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
5966 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
5969 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
5971 unsigned dest_idx
= e
->dest_idx
;
5972 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
5973 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
5975 /* Simply fill all args. */
5976 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
5977 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
5978 vect_get_slp_vect_def (child
, i
),
5979 e
, gimple_phi_arg_location (phi
, dest_idx
));
5984 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
5987 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
5989 slp_instance instance
;
5992 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
5994 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
5996 slp_tree node
= SLP_INSTANCE_TREE (instance
);
5997 if (dump_enabled_p ())
5999 dump_printf_loc (MSG_NOTE
, vect_location
,
6000 "Vectorizing SLP tree:\n");
6001 if (SLP_INSTANCE_ROOT_STMT (instance
))
6002 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
6003 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
6004 vect_print_slp_graph (MSG_NOTE
, vect_location
,
6005 SLP_INSTANCE_TREE (instance
));
6007 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6008 have a PHI be the node breaking the cycle. */
6009 auto_vec
<slp_tree
> stack
;
6010 if (!scc_info
.get (node
))
6011 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
6013 if (SLP_INSTANCE_ROOT_STMT (instance
))
6014 vectorize_slp_instance_root_stmt (node
, instance
);
6016 if (dump_enabled_p ())
6017 dump_printf_loc (MSG_NOTE
, vect_location
,
6018 "vectorizing stmts using SLP.\n");
6021 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6023 slp_tree root
= SLP_INSTANCE_TREE (instance
);
6024 stmt_vec_info store_info
;
6027 /* Remove scalar call stmts. Do not do this for basic-block
6028 vectorization as not all uses may be vectorized.
6029 ??? Why should this be necessary? DCE should be able to
6030 remove the stmts itself.
6031 ??? For BB vectorization we can as well remove scalar
6032 stmts starting from the SLP tree root if they have no
6034 if (is_a
<loop_vec_info
> (vinfo
))
6035 vect_remove_slp_scalar_calls (vinfo
, root
);
6037 /* Remove vectorized stores original scalar stmts. */
6038 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
6040 if (!STMT_VINFO_DATA_REF (store_info
)
6041 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
6044 store_info
= vect_orig_stmt (store_info
);
6045 /* Free the attached stmt_vec_info and remove the stmt. */
6046 vinfo
->remove_stmt (store_info
);