1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
50 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
51 slp_tree
, stmt_vector_for_cost
*);
53 /* Initialize a SLP node. */
55 _slp_tree::_slp_tree ()
57 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
58 SLP_TREE_SCALAR_OPS (this) = vNULL
;
59 SLP_TREE_VEC_STMTS (this) = vNULL
;
60 SLP_TREE_VEC_DEFS (this) = vNULL
;
61 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
62 SLP_TREE_CHILDREN (this) = vNULL
;
63 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
64 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
65 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
66 SLP_TREE_CODE (this) = ERROR_MARK
;
67 SLP_TREE_VECTYPE (this) = NULL_TREE
;
68 SLP_TREE_REPRESENTATIVE (this) = NULL
;
74 /* Tear down a SLP node. */
76 _slp_tree::~_slp_tree ()
78 SLP_TREE_CHILDREN (this).release ();
79 SLP_TREE_SCALAR_STMTS (this).release ();
80 SLP_TREE_SCALAR_OPS (this).release ();
81 SLP_TREE_VEC_STMTS (this).release ();
82 SLP_TREE_VEC_DEFS (this).release ();
83 SLP_TREE_LOAD_PERMUTATION (this).release ();
84 SLP_TREE_LANE_PERMUTATION (this).release ();
87 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
90 vect_free_slp_tree (slp_tree node
)
95 if (--node
->refcnt
!= 0)
98 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
99 vect_free_slp_tree (child
);
104 /* Return a location suitable for dumps related to the SLP instance. */
107 _slp_instance::location () const
110 return root_stmt
->stmt
;
112 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
116 /* Free the memory allocated for the SLP instance. */
119 vect_free_slp_instance (slp_instance instance
)
121 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
122 SLP_INSTANCE_LOADS (instance
).release ();
123 instance
->subgraph_entries
.release ();
124 instance
->cost_vec
.release ();
129 /* Create an SLP node for SCALAR_STMTS. */
132 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
134 slp_tree node
= new _slp_tree
;
135 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
136 SLP_TREE_CHILDREN (node
).create (nops
);
137 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
138 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
139 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
143 /* Create an SLP node for OPS. */
146 vect_create_new_slp_node (vec
<tree
> ops
)
148 slp_tree node
= new _slp_tree
;
149 SLP_TREE_SCALAR_OPS (node
) = ops
;
150 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
151 SLP_TREE_LANES (node
) = ops
.length ();
156 /* This structure is used in creation of an SLP tree. Each instance
157 corresponds to the same operand in a group of scalar stmts in an SLP
159 typedef struct _slp_oprnd_info
161 /* Def-stmts for the operands. */
162 vec
<stmt_vec_info
> def_stmts
;
165 /* Information about the first statement, its vector def-type, type, the
166 operand itself in case it's constant, and an indication if it's a pattern
169 enum vect_def_type first_dt
;
174 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
176 static vec
<slp_oprnd_info
>
177 vect_create_oprnd_info (int nops
, int group_size
)
180 slp_oprnd_info oprnd_info
;
181 vec
<slp_oprnd_info
> oprnds_info
;
183 oprnds_info
.create (nops
);
184 for (i
= 0; i
< nops
; i
++)
186 oprnd_info
= XNEW (struct _slp_oprnd_info
);
187 oprnd_info
->def_stmts
.create (group_size
);
188 oprnd_info
->ops
.create (group_size
);
189 oprnd_info
->first_dt
= vect_uninitialized_def
;
190 oprnd_info
->first_op_type
= NULL_TREE
;
191 oprnd_info
->any_pattern
= false;
192 oprnds_info
.quick_push (oprnd_info
);
199 /* Free operands info. */
202 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
205 slp_oprnd_info oprnd_info
;
207 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
209 oprnd_info
->def_stmts
.release ();
210 oprnd_info
->ops
.release ();
211 XDELETE (oprnd_info
);
214 oprnds_info
.release ();
218 /* Return true if STMTS contains a pattern statement. */
221 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
223 stmt_vec_info stmt_info
;
225 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
226 if (is_pattern_stmt_p (stmt_info
))
231 /* Return true when all lanes in the external or constant NODE have
235 vect_slp_tree_uniform_p (slp_tree node
)
237 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
238 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
240 /* Pre-existing vectors. */
241 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
245 tree op
, first
= NULL_TREE
;
246 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
249 else if (!operand_equal_p (first
, op
, 0))
255 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
256 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
260 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
261 stmt_vec_info first_stmt_info
)
263 stmt_vec_info next_stmt_info
= first_stmt_info
;
266 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
271 if (next_stmt_info
== stmt_info
)
273 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
275 result
+= DR_GROUP_GAP (next_stmt_info
);
277 while (next_stmt_info
);
282 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
283 using the method implemented by duplicate_and_interleave. Return true
284 if so, returning the number of intermediate vectors in *NVECTORS_OUT
285 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
289 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
290 tree elt_type
, unsigned int *nvectors_out
,
291 tree
*vector_type_out
,
294 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
295 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
298 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
299 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
300 unsigned int nvectors
= 1;
303 scalar_int_mode int_mode
;
304 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
305 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
307 /* Get the natural vector type for this SLP group size. */
308 tree int_type
= build_nonstandard_integer_type
309 (GET_MODE_BITSIZE (int_mode
), 1);
311 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
313 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
314 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
315 GET_MODE_SIZE (base_vector_mode
)))
317 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
318 together into elements of type INT_TYPE and using the result
319 to build NVECTORS vectors. */
320 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
321 vec_perm_builder
sel1 (nelts
, 2, 3);
322 vec_perm_builder
sel2 (nelts
, 2, 3);
323 poly_int64 half_nelts
= exact_div (nelts
, 2);
324 for (unsigned int i
= 0; i
< 3; ++i
)
327 sel1
.quick_push (i
+ nelts
);
328 sel2
.quick_push (half_nelts
+ i
);
329 sel2
.quick_push (half_nelts
+ i
+ nelts
);
331 vec_perm_indices
indices1 (sel1
, 2, nelts
);
332 vec_perm_indices
indices2 (sel2
, 2, nelts
);
333 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
334 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
337 *nvectors_out
= nvectors
;
339 *vector_type_out
= vector_type
;
342 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
344 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
351 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
357 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
358 they are of a valid type and that they match the defs of the first stmt of
359 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
360 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
361 indicates swap is required for cond_expr stmts. Specifically, *SWAP
362 is 1 if STMT is cond and operands of comparison need to be swapped;
363 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
364 If there is any operand swap in this function, *SWAP is set to non-zero
366 If there was a fatal error return -1; if the error could be corrected by
367 swapping operands of father node of this one, return 1; if everything is
370 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
371 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
372 vec
<slp_oprnd_info
> *oprnds_info
)
374 stmt_vec_info stmt_info
= stmts
[stmt_num
];
376 unsigned int i
, number_of_oprnds
;
377 enum vect_def_type dt
= vect_uninitialized_def
;
378 slp_oprnd_info oprnd_info
;
379 int first_op_idx
= 1;
380 unsigned int commutative_op
= -1U;
381 bool first_op_cond
= false;
382 bool first
= stmt_num
== 0;
384 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
386 number_of_oprnds
= gimple_call_num_args (stmt
);
388 if (gimple_call_internal_p (stmt
))
390 internal_fn ifn
= gimple_call_internal_fn (stmt
);
391 commutative_op
= first_commutative_argument (ifn
);
393 /* Masked load, only look at mask. */
394 if (ifn
== IFN_MASK_LOAD
)
396 number_of_oprnds
= 1;
397 /* Mask operand index. */
402 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
404 enum tree_code code
= gimple_assign_rhs_code (stmt
);
405 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
406 /* Swap can only be done for cond_expr if asked to, otherwise we
407 could result in different comparison code to the first stmt. */
408 if (code
== COND_EXPR
409 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
411 first_op_cond
= true;
415 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
420 bool swapped
= (swap
!= 0);
421 gcc_assert (!swapped
|| first_op_cond
);
422 for (i
= 0; i
< number_of_oprnds
; i
++)
427 /* Map indicating how operands of cond_expr should be swapped. */
428 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
429 int *map
= maps
[swap
];
432 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
433 first_op_idx
), map
[i
]);
435 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
438 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
439 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
440 oprnd
= TREE_OPERAND (oprnd
, 0);
442 oprnd_info
= (*oprnds_info
)[i
];
444 stmt_vec_info def_stmt_info
;
445 if (!vect_is_simple_use (oprnd
, vinfo
, &dt
, &def_stmt_info
))
447 if (dump_enabled_p ())
448 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
449 "Build SLP failed: can't analyze def for %T\n",
455 if (def_stmt_info
&& is_pattern_stmt_p (def_stmt_info
))
456 oprnd_info
->any_pattern
= true;
460 /* For the swapping logic below force vect_reduction_def
461 for the reduction op in a SLP reduction group. */
462 if (!STMT_VINFO_DATA_REF (stmt_info
)
463 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
464 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
466 dt
= vect_reduction_def
;
467 oprnd_info
->first_dt
= dt
;
468 oprnd_info
->first_op_type
= TREE_TYPE (oprnd
);
472 /* Not first stmt of the group, check that the def-stmt/s match
473 the def-stmt/s of the first stmt. Allow different definition
474 types for reduction chains: the first stmt must be a
475 vect_reduction_def (a phi node), and the rest
476 end in the reduction chain. */
477 tree type
= TREE_TYPE (oprnd
);
478 if ((oprnd_info
->first_dt
!= dt
479 && !(oprnd_info
->first_dt
== vect_reduction_def
480 && !STMT_VINFO_DATA_REF (stmt_info
)
481 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
483 && !STMT_VINFO_DATA_REF (def_stmt_info
)
484 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
485 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
486 && !((oprnd_info
->first_dt
== vect_external_def
487 || oprnd_info
->first_dt
== vect_constant_def
)
488 && (dt
== vect_external_def
489 || dt
== vect_constant_def
)))
490 || !types_compatible_p (oprnd_info
->first_op_type
, type
)
491 || (!STMT_VINFO_DATA_REF (stmt_info
)
492 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
494 || STMT_VINFO_DATA_REF (def_stmt_info
)
495 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
496 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
497 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
499 /* Try swapping operands if we got a mismatch. */
500 if (i
== commutative_op
&& !swapped
)
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE
, vect_location
,
504 "trying swapped operands\n");
509 if (dump_enabled_p ())
510 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
511 "Build SLP failed: different types\n");
515 if ((dt
== vect_constant_def
516 || dt
== vect_external_def
)
517 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
518 && (TREE_CODE (type
) == BOOLEAN_TYPE
519 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
524 "Build SLP failed: invalid type of def "
525 "for variable-length SLP %T\n", oprnd
);
530 /* Check the types of the definitions. */
533 case vect_external_def
:
534 /* Make sure to demote the overall operand to external. */
535 oprnd_info
->first_dt
= vect_external_def
;
537 case vect_constant_def
:
538 oprnd_info
->def_stmts
.quick_push (NULL
);
539 oprnd_info
->ops
.quick_push (oprnd
);
542 case vect_internal_def
:
543 case vect_reduction_def
:
544 if (oprnd_info
->first_dt
== vect_reduction_def
545 && !STMT_VINFO_DATA_REF (stmt_info
)
546 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
547 && !STMT_VINFO_DATA_REF (def_stmt_info
)
548 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
549 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
551 /* For a SLP reduction chain we want to duplicate the
552 reduction to each of the chain members. That gets
553 us a sane SLP graph (still the stmts are not 100%
554 correct wrt the initial values). */
556 oprnd_info
->def_stmts
.quick_push (oprnd_info
->def_stmts
[0]);
557 oprnd_info
->ops
.quick_push (oprnd_info
->ops
[0]);
561 case vect_induction_def
:
562 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
563 oprnd_info
->ops
.quick_push (oprnd
);
567 /* FORNOW: Not supported. */
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
570 "Build SLP failed: illegal type of def %T\n",
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE
, vect_location
,
582 "swapped operands to match def types in %G",
589 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
590 Return true if we can, meaning that this choice doesn't conflict with
591 existing SLP nodes that use STMT_INFO. */
594 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
596 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
598 return useless_type_conversion_p (vectype
, old_vectype
);
600 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
602 /* We maintain the invariant that if any statement in the group is
603 used, all other members of the group have the same vector type. */
604 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
605 stmt_vec_info member_info
= first_info
;
606 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
607 if (is_pattern_stmt_p (member_info
)
608 && !useless_type_conversion_p (vectype
,
609 STMT_VINFO_VECTYPE (member_info
)))
614 for (member_info
= first_info
; member_info
;
615 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
616 STMT_VINFO_VECTYPE (member_info
) = vectype
;
620 else if (!is_pattern_stmt_p (stmt_info
))
622 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
626 if (dump_enabled_p ())
628 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
629 "Build SLP failed: incompatible vector"
630 " types for: %G", stmt_info
->stmt
);
631 dump_printf_loc (MSG_NOTE
, vect_location
,
632 " old vector type: %T\n", old_vectype
);
633 dump_printf_loc (MSG_NOTE
, vect_location
,
634 " new vector type: %T\n", vectype
);
639 /* Return true if call statements CALL1 and CALL2 are similar enough
640 to be combined into the same SLP group. */
643 compatible_calls_p (gcall
*call1
, gcall
*call2
)
645 unsigned int nargs
= gimple_call_num_args (call1
);
646 if (nargs
!= gimple_call_num_args (call2
))
649 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
652 if (gimple_call_internal_p (call1
))
654 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
655 TREE_TYPE (gimple_call_lhs (call2
))))
657 for (unsigned int i
= 0; i
< nargs
; ++i
)
658 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
659 TREE_TYPE (gimple_call_arg (call2
, i
))))
664 if (!operand_equal_p (gimple_call_fn (call1
),
665 gimple_call_fn (call2
), 0))
668 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
674 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
675 caller's attempt to find the vector type in STMT_INFO with the narrowest
676 element type. Return true if VECTYPE is nonnull and if it is valid
677 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
678 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
679 vect_build_slp_tree. */
682 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
683 unsigned int group_size
,
684 tree vectype
, poly_uint64
*max_nunits
)
688 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
690 "Build SLP failed: unsupported data-type in %G\n",
692 /* Fatal mismatch. */
696 /* If populating the vector type requires unrolling then fail
697 before adjusting *max_nunits for basic-block vectorization. */
698 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
699 unsigned HOST_WIDE_INT const_nunits
;
700 if (is_a
<bb_vec_info
> (vinfo
)
701 && (!nunits
.is_constant (&const_nunits
)
702 || const_nunits
> group_size
))
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
706 "Build SLP failed: unrolling required "
707 "in basic block SLP\n");
708 /* Fatal mismatch. */
712 /* In case of multiple types we need to detect the smallest type. */
713 vect_update_max_nunits (max_nunits
, vectype
);
717 /* Verify if the scalar stmts STMTS are isomorphic, require data
718 permutation or are of unsupported types of operation. Return
719 true if they are, otherwise return false and indicate in *MATCHES
720 which stmts are not isomorphic to the first one. If MATCHES[0]
721 is false then this indicates the comparison could not be
722 carried out or the stmts will never be vectorized by SLP.
724 Note COND_EXPR is possibly isomorphic to another one after swapping its
725 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
726 the first stmt by swapping the two operands of comparison; set SWAP[i]
727 to 2 if stmt I is isomorphic to the first stmt by inverting the code
728 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
729 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
732 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
733 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
734 poly_uint64
*max_nunits
, bool *matches
,
735 bool *two_operators
, tree
*node_vectype
)
738 stmt_vec_info first_stmt_info
= stmts
[0];
739 enum tree_code first_stmt_code
= ERROR_MARK
;
740 enum tree_code alt_stmt_code
= ERROR_MARK
;
741 enum tree_code rhs_code
= ERROR_MARK
;
742 enum tree_code first_cond_code
= ERROR_MARK
;
744 bool need_same_oprnds
= false;
745 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
748 machine_mode optab_op2_mode
;
749 machine_mode vec_mode
;
750 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
751 bool first_stmt_load_p
= false, load_p
= false;
753 /* For every stmt in NODE find its def stmt/s. */
754 stmt_vec_info stmt_info
;
755 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
757 gimple
*stmt
= stmt_info
->stmt
;
761 if (dump_enabled_p ())
762 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
764 /* Fail to vectorize statements marked as unvectorizable. */
765 if (!STMT_VINFO_VECTORIZABLE (stmt_info
))
767 if (dump_enabled_p ())
768 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
769 "Build SLP failed: unvectorizable statement %G",
771 /* Fatal mismatch. */
776 lhs
= gimple_get_lhs (stmt
);
777 if (lhs
== NULL_TREE
)
779 if (dump_enabled_p ())
780 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
781 "Build SLP failed: not GIMPLE_ASSIGN nor "
782 "GIMPLE_CALL %G", stmt
);
783 /* Fatal mismatch. */
789 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
790 &nunits_vectype
, group_size
)
792 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
793 nunits_vectype
, max_nunits
)))
795 /* Fatal mismatch. */
800 gcc_assert (vectype
);
802 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
805 rhs_code
= CALL_EXPR
;
807 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
809 else if ((gimple_call_internal_p (call_stmt
)
810 && (!vectorizable_internal_fn_p
811 (gimple_call_internal_fn (call_stmt
))))
812 || gimple_call_tail_p (call_stmt
)
813 || gimple_call_noreturn_p (call_stmt
)
814 || !gimple_call_nothrow_p (call_stmt
)
815 || gimple_call_chain (call_stmt
))
817 if (dump_enabled_p ())
818 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
819 "Build SLP failed: unsupported call type %G",
821 /* Fatal mismatch. */
828 rhs_code
= gimple_assign_rhs_code (stmt
);
829 load_p
= gimple_vuse (stmt
);
832 /* Check the operation. */
835 *node_vectype
= vectype
;
836 first_stmt_code
= rhs_code
;
837 first_stmt_load_p
= load_p
;
839 /* Shift arguments should be equal in all the packed stmts for a
840 vector shift with scalar shift operand. */
841 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
842 || rhs_code
== LROTATE_EXPR
843 || rhs_code
== RROTATE_EXPR
)
845 vec_mode
= TYPE_MODE (vectype
);
847 /* First see if we have a vector/vector shift. */
848 optab
= optab_for_tree_code (rhs_code
, vectype
,
852 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
854 /* No vector/vector shift, try for a vector/scalar shift. */
855 optab
= optab_for_tree_code (rhs_code
, vectype
,
860 if (dump_enabled_p ())
861 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
862 "Build SLP failed: no optab.\n");
863 /* Fatal mismatch. */
867 icode
= (int) optab_handler (optab
, vec_mode
);
868 if (icode
== CODE_FOR_nothing
)
870 if (dump_enabled_p ())
871 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
873 "op not supported by target.\n");
874 /* Fatal mismatch. */
878 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
879 if (!VECTOR_MODE_P (optab_op2_mode
))
881 need_same_oprnds
= true;
882 first_op1
= gimple_assign_rhs2 (stmt
);
886 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
888 need_same_oprnds
= true;
889 first_op1
= gimple_assign_rhs2 (stmt
);
892 && rhs_code
== BIT_FIELD_REF
)
894 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
895 if (TREE_CODE (vec
) != SSA_NAME
896 || !types_compatible_p (vectype
, TREE_TYPE (vec
)))
898 if (dump_enabled_p ())
899 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
901 "BIT_FIELD_REF not supported\n");
902 /* Fatal mismatch. */
908 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
910 need_same_oprnds
= true;
911 first_op1
= gimple_call_arg (call_stmt
, 1);
916 if (first_stmt_code
!= rhs_code
917 && alt_stmt_code
== ERROR_MARK
)
918 alt_stmt_code
= rhs_code
;
919 if ((first_stmt_code
!= rhs_code
920 && (first_stmt_code
!= IMAGPART_EXPR
921 || rhs_code
!= REALPART_EXPR
)
922 && (first_stmt_code
!= REALPART_EXPR
923 || rhs_code
!= IMAGPART_EXPR
)
924 /* Handle mismatches in plus/minus by computing both
925 and merging the results. */
926 && !((first_stmt_code
== PLUS_EXPR
927 || first_stmt_code
== MINUS_EXPR
)
928 && (alt_stmt_code
== PLUS_EXPR
929 || alt_stmt_code
== MINUS_EXPR
)
930 && rhs_code
== alt_stmt_code
)
931 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
932 && (first_stmt_code
== ARRAY_REF
933 || first_stmt_code
== BIT_FIELD_REF
934 || first_stmt_code
== INDIRECT_REF
935 || first_stmt_code
== COMPONENT_REF
936 || first_stmt_code
== MEM_REF
)))
937 || first_stmt_load_p
!= load_p
)
939 if (dump_enabled_p ())
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
942 "Build SLP failed: different operation "
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
945 "original stmt %G", first_stmt_info
->stmt
);
951 if (need_same_oprnds
)
953 tree other_op1
= (call_stmt
954 ? gimple_call_arg (call_stmt
, 1)
955 : gimple_assign_rhs2 (stmt
));
956 if (!operand_equal_p (first_op1
, other_op1
, 0))
958 if (dump_enabled_p ())
959 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
960 "Build SLP failed: different shift "
961 "arguments in %G", stmt
);
967 && first_stmt_code
== BIT_FIELD_REF
968 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
969 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
971 if (dump_enabled_p ())
972 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
973 "Build SLP failed: different BIT_FIELD_REF "
974 "arguments in %G", stmt
);
979 if (!load_p
&& rhs_code
== CALL_EXPR
)
981 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
982 as_a
<gcall
*> (stmt
)))
984 if (dump_enabled_p ())
985 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
986 "Build SLP failed: different calls in %G",
993 if (!types_compatible_p (vectype
, *node_vectype
))
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
997 "Build SLP failed: different vector type "
1004 /* Grouped store or load. */
1005 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1007 if (REFERENCE_CLASS_P (lhs
))
1015 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1016 if (prev_first_load
)
1018 /* Check that there are no loads from different interleaving
1019 chains in the same node. */
1020 if (prev_first_load
!= first_load
)
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1025 "Build SLP failed: different "
1026 "interleaving chains in one node %G",
1033 prev_first_load
= first_load
;
1035 } /* Grouped access. */
1040 /* Not grouped load. */
1041 if (dump_enabled_p ())
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1043 "Build SLP failed: not grouped load %G", stmt
);
1045 /* FORNOW: Not grouped loads are not supported. */
1046 /* Fatal mismatch. */
1051 /* Not memory operation. */
1052 if (TREE_CODE_CLASS (rhs_code
) != tcc_binary
1053 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1054 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1055 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1056 && rhs_code
!= VIEW_CONVERT_EXPR
1057 && rhs_code
!= CALL_EXPR
1058 && rhs_code
!= BIT_FIELD_REF
)
1060 if (dump_enabled_p ())
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1062 "Build SLP failed: operation unsupported %G",
1064 /* Fatal mismatch. */
1069 if (rhs_code
== COND_EXPR
)
1071 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1072 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1073 enum tree_code swap_code
= ERROR_MARK
;
1074 enum tree_code invert_code
= ERROR_MARK
;
1077 first_cond_code
= TREE_CODE (cond_expr
);
1078 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1080 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1081 swap_code
= swap_tree_comparison (cond_code
);
1082 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1085 if (first_cond_code
== cond_code
)
1087 /* Isomorphic can be achieved by swapping. */
1088 else if (first_cond_code
== swap_code
)
1090 /* Isomorphic can be achieved by inverting. */
1091 else if (first_cond_code
== invert_code
)
1095 if (dump_enabled_p ())
1096 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1097 "Build SLP failed: different"
1098 " operation %G", stmt
);
1108 for (i
= 0; i
< group_size
; ++i
)
1112 /* If we allowed a two-operation SLP node verify the target can cope
1113 with the permute we are going to use. */
1114 if (alt_stmt_code
!= ERROR_MARK
1115 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1117 *two_operators
= true;
1123 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1124 Note we never remove apart from at destruction time so we do not
1125 need a special value for deleted that differs from empty. */
1128 typedef vec
<stmt_vec_info
> value_type
;
1129 typedef vec
<stmt_vec_info
> compare_type
;
1130 static inline hashval_t
hash (value_type
);
1131 static inline bool equal (value_type existing
, value_type candidate
);
1132 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1133 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1134 static const bool empty_zero_p
= true;
1135 static inline void mark_empty (value_type
&x
) { x
.release (); }
1136 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1137 static inline void remove (value_type
&x
) { x
.release (); }
1140 bst_traits::hash (value_type x
)
1143 for (unsigned i
= 0; i
< x
.length (); ++i
)
1144 h
.add_int (gimple_uid (x
[i
]->stmt
));
1148 bst_traits::equal (value_type existing
, value_type candidate
)
1150 if (existing
.length () != candidate
.length ())
1152 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1153 if (existing
[i
] != candidate
[i
])
1158 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1159 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1160 scalar_stmts_to_slp_tree_map_t
;
1163 vect_build_slp_tree_2 (vec_info
*vinfo
,
1164 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1165 poly_uint64
*max_nunits
,
1166 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1167 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1170 vect_build_slp_tree (vec_info
*vinfo
,
1171 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1172 poly_uint64
*max_nunits
,
1173 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1174 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1176 if (slp_tree
*leader
= bst_map
->get (stmts
))
1178 if (dump_enabled_p ())
1179 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1180 *leader
? "" : "failed ", *leader
);
1183 (*leader
)->refcnt
++;
1184 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1188 poly_uint64 this_max_nunits
= 1;
1189 slp_tree res
= vect_build_slp_tree_2 (vinfo
, stmts
, group_size
,
1191 matches
, npermutes
, tree_size
, bst_map
);
1194 res
->max_nunits
= this_max_nunits
;
1195 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1196 /* Keep a reference for the bst_map use. */
1199 bst_map
->put (stmts
.copy (), res
);
1203 /* Recursively build an SLP tree starting from NODE.
1204 Fail (and return a value not equal to zero) if def-stmts are not
1205 isomorphic, require data permutation or are of unsupported types of
1206 operation. Otherwise, return 0.
1207 The value returned is the depth in the SLP tree where a mismatch
1211 vect_build_slp_tree_2 (vec_info
*vinfo
,
1212 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1213 poly_uint64
*max_nunits
,
1214 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1215 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1217 unsigned nops
, i
, this_tree_size
= 0;
1218 poly_uint64 this_max_nunits
= *max_nunits
;
1223 stmt_vec_info stmt_info
= stmts
[0];
1224 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1225 nops
= gimple_call_num_args (stmt
);
1226 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1228 nops
= gimple_num_ops (stmt
) - 1;
1229 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1232 else if (is_a
<gphi
*> (stmt_info
->stmt
))
1237 /* If the SLP node is a PHI (induction or reduction), terminate
1239 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1241 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1242 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
1243 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1247 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1248 /* Induction from different IVs is not supported. */
1249 if (def_type
== vect_induction_def
)
1251 stmt_vec_info other_info
;
1252 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1253 if (stmt_info
!= other_info
)
1256 else if (def_type
== vect_reduction_def
1257 || def_type
== vect_double_reduction_def
1258 || def_type
== vect_nested_cycle
)
1260 /* Else def types have to match. */
1261 stmt_vec_info other_info
;
1262 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1263 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1269 node
= vect_create_new_slp_node (stmts
, 0);
1270 SLP_TREE_VECTYPE (node
) = vectype
;
1275 bool two_operators
= false;
1276 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1277 tree vectype
= NULL_TREE
;
1278 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1279 &this_max_nunits
, matches
, &two_operators
,
1283 /* If the SLP node is a load, terminate the recursion unless masked. */
1284 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1285 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1287 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1290 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1295 *max_nunits
= this_max_nunits
;
1297 node
= vect_create_new_slp_node (stmts
, 0);
1298 SLP_TREE_VECTYPE (node
) = vectype
;
1299 /* And compute the load permutation. Whether it is actually
1300 a permutation depends on the unrolling factor which is
1302 vec
<unsigned> load_permutation
;
1304 stmt_vec_info load_info
;
1305 load_permutation
.create (group_size
);
1306 stmt_vec_info first_stmt_info
1307 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1308 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1310 int load_place
= vect_get_place_in_interleaving_chain
1311 (load_info
, first_stmt_info
);
1312 gcc_assert (load_place
!= -1);
1313 load_permutation
.safe_push (load_place
);
1315 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1319 else if (gimple_assign_single_p (stmt_info
->stmt
)
1320 && !gimple_vuse (stmt_info
->stmt
)
1321 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1323 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1324 the same SSA name vector of a compatible type to vectype. */
1325 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1326 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1327 stmt_vec_info estmt_info
;
1328 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1330 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1331 tree bfref
= gimple_assign_rhs1 (estmt
);
1333 if (!known_eq (bit_field_size (bfref
),
1334 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1335 || !constant_multiple_p (bit_field_offset (bfref
),
1336 bit_field_size (bfref
), &lane
))
1341 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1343 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1344 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1345 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1346 /* We are always building a permutation node even if it is an identity
1347 permute to shield the rest of the vectorizer from the odd node
1348 representing an actual vector without any scalar ops.
1349 ??? We could hide it completely with making the permute node
1351 node
= vect_create_new_slp_node (stmts
, 1);
1352 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1353 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1354 SLP_TREE_VECTYPE (node
) = vectype
;
1355 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1359 /* Get at the operands, verifying they are compatible. */
1360 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1361 slp_oprnd_info oprnd_info
;
1362 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1364 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
],
1365 stmts
, i
, &oprnds_info
);
1367 matches
[(res
== -1) ? 0 : i
] = false;
1371 for (i
= 0; i
< group_size
; ++i
)
1374 vect_free_oprnd_info (oprnds_info
);
1379 auto_vec
<slp_tree
, 4> children
;
1381 stmt_info
= stmts
[0];
1383 /* Create SLP_TREE nodes for the definition node/s. */
1384 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1389 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1391 /* COND_EXPR have one too many eventually if the condition
1393 gcc_assert (i
== 3 && nops
== 4);
1397 if (oprnd_info
->first_dt
!= vect_internal_def
1398 && oprnd_info
->first_dt
!= vect_reduction_def
1399 && oprnd_info
->first_dt
!= vect_induction_def
)
1401 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1402 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1403 oprnd_info
->ops
= vNULL
;
1404 children
.safe_push (invnode
);
1408 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1409 group_size
, &this_max_nunits
,
1411 &this_tree_size
, bst_map
)) != NULL
)
1413 oprnd_info
->def_stmts
= vNULL
;
1414 children
.safe_push (child
);
1418 /* If the SLP build for operand zero failed and operand zero
1419 and one can be commutated try that for the scalar stmts
1420 that failed the match. */
1422 /* A first scalar stmt mismatch signals a fatal mismatch. */
1424 /* ??? For COND_EXPRs we can swap the comparison operands
1425 as well as the arms under some constraints. */
1427 && oprnds_info
[1]->first_dt
== vect_internal_def
1428 && is_gimple_assign (stmt_info
->stmt
)
1429 /* Swapping operands for reductions breaks assumptions later on. */
1430 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1431 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
1432 /* Do so only if the number of not successful permutes was nor more
1433 than a cut-ff as re-trying the recursive match on
1434 possibly each level of the tree would expose exponential
1438 /* See whether we can swap the matching or the non-matching
1440 bool swap_not_matching
= true;
1443 for (j
= 0; j
< group_size
; ++j
)
1445 if (matches
[j
] != !swap_not_matching
)
1447 stmt_vec_info stmt_info
= stmts
[j
];
1448 /* Verify if we can swap operands of this stmt. */
1449 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1451 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1453 if (!swap_not_matching
)
1455 swap_not_matching
= false;
1460 while (j
!= group_size
);
1462 /* Swap mismatched definition stmts. */
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_NOTE
, vect_location
,
1465 "Re-trying with swapped operands of stmts ");
1466 for (j
= 0; j
< group_size
; ++j
)
1467 if (matches
[j
] == !swap_not_matching
)
1469 std::swap (oprnds_info
[0]->def_stmts
[j
],
1470 oprnds_info
[1]->def_stmts
[j
]);
1471 std::swap (oprnds_info
[0]->ops
[j
],
1472 oprnds_info
[1]->ops
[j
]);
1473 if (dump_enabled_p ())
1474 dump_printf (MSG_NOTE
, "%d ", j
);
1476 if (dump_enabled_p ())
1477 dump_printf (MSG_NOTE
, "\n");
1478 /* And try again with scratch 'matches' ... */
1479 bool *tem
= XALLOCAVEC (bool, group_size
);
1480 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1481 group_size
, &this_max_nunits
,
1483 &this_tree_size
, bst_map
)) != NULL
)
1485 oprnd_info
->def_stmts
= vNULL
;
1486 children
.safe_push (child
);
1489 /* We do not undo the swapping here since it might still be
1490 the better order for the second operand in case we build
1491 the first one from scalars below. */
1496 /* If the SLP build failed and we analyze a basic-block
1497 simply treat nodes we fail to build as externally defined
1498 (and thus build vectors from the scalar defs).
1499 The cost model will reject outright expensive cases.
1500 ??? This doesn't treat cases where permutation ultimatively
1501 fails (or we don't try permutation below). Ideally we'd
1502 even compute a permutation that will end up with the maximum
1504 if (is_a
<bb_vec_info
> (vinfo
)
1505 /* ??? Rejecting patterns this way doesn't work. We'd have to
1506 do extra work to cancel the pattern so the uses see the
1508 && !is_pattern_stmt_p (stmt_info
)
1509 && !oprnd_info
->any_pattern
)
1511 /* But if there's a leading vector sized set of matching stmts
1512 fail here so we can split the group. This matches the condition
1513 vect_analyze_slp_instance uses. */
1514 /* ??? We might want to split here and combine the results to support
1515 multiple vector sizes better. */
1516 for (j
= 0; j
< group_size
; ++j
)
1519 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1521 if (dump_enabled_p ())
1522 dump_printf_loc (MSG_NOTE
, vect_location
,
1523 "Building vector operands from scalars\n");
1525 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1526 children
.safe_push (child
);
1527 oprnd_info
->ops
= vNULL
;
1528 oprnd_info
->def_stmts
= vNULL
;
1533 gcc_assert (child
== NULL
);
1534 FOR_EACH_VEC_ELT (children
, j
, child
)
1535 vect_free_slp_tree (child
);
1536 vect_free_oprnd_info (oprnds_info
);
1540 vect_free_oprnd_info (oprnds_info
);
1542 /* If we have all children of a child built up from uniform scalars
1543 then just throw that away, causing it built up from scalars.
1544 The exception is the SLP node for the vector store. */
1545 if (is_a
<bb_vec_info
> (vinfo
)
1546 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1547 /* ??? Rejecting patterns this way doesn't work. We'd have to
1548 do extra work to cancel the pattern so the uses see the
1550 && !is_pattern_stmt_p (stmt_info
))
1554 FOR_EACH_VEC_ELT (children
, j
, child
)
1555 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
1556 || !vect_slp_tree_uniform_p (child
))
1562 FOR_EACH_VEC_ELT (children
, j
, child
)
1563 vect_free_slp_tree (child
);
1565 if (dump_enabled_p ())
1566 dump_printf_loc (MSG_NOTE
, vect_location
,
1567 "Building parent vector operands from "
1568 "scalars instead\n");
1573 *tree_size
+= this_tree_size
+ 1;
1574 *max_nunits
= this_max_nunits
;
1578 /* ??? We'd likely want to either cache in bst_map sth like
1579 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1580 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1581 explicit stmts to put in so the keying on 'stmts' doesn't
1582 work (but we have the same issue with nodes that use 'ops'). */
1583 slp_tree one
= new _slp_tree
;
1584 slp_tree two
= new _slp_tree
;
1585 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1586 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1587 SLP_TREE_VECTYPE (one
) = vectype
;
1588 SLP_TREE_VECTYPE (two
) = vectype
;
1589 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1590 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1592 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1595 /* Here we record the original defs since this
1596 node represents the final lane configuration. */
1597 node
= vect_create_new_slp_node (stmts
, 2);
1598 SLP_TREE_VECTYPE (node
) = vectype
;
1599 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1600 SLP_TREE_CHILDREN (node
).quick_push (one
);
1601 SLP_TREE_CHILDREN (node
).quick_push (two
);
1602 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1603 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1604 enum tree_code ocode
= ERROR_MARK
;
1605 stmt_vec_info ostmt_info
;
1607 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1609 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1610 if (gimple_assign_rhs_code (ostmt
) != code0
)
1612 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1613 ocode
= gimple_assign_rhs_code (ostmt
);
1617 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1619 SLP_TREE_CODE (one
) = code0
;
1620 SLP_TREE_CODE (two
) = ocode
;
1621 SLP_TREE_LANES (one
) = stmts
.length ();
1622 SLP_TREE_LANES (two
) = stmts
.length ();
1623 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1624 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1628 node
= vect_create_new_slp_node (stmts
, nops
);
1629 SLP_TREE_VECTYPE (node
) = vectype
;
1630 SLP_TREE_CHILDREN (node
).splice (children
);
1634 /* Dump a single SLP tree NODE. */
1637 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1642 stmt_vec_info stmt_info
;
1645 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1646 dump_user_location_t user_loc
= loc
.get_user_location ();
1647 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1648 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1650 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1653 estimated_poly_value (node
->max_nunits
), node
->refcnt
);
1654 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1655 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1656 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1659 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1660 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1661 dump_printf (metadata
, "%T%s ", op
,
1662 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1663 dump_printf (metadata
, "}\n");
1665 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1667 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1668 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1669 dump_printf (dump_kind
, " %u", j
);
1670 dump_printf (dump_kind
, " }\n");
1672 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1674 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1675 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1676 dump_printf (dump_kind
, " %u[%u]",
1677 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1678 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
1679 dump_printf (dump_kind
, " }\n");
1681 if (SLP_TREE_CHILDREN (node
).is_empty ())
1683 dump_printf_loc (metadata
, user_loc
, "\tchildren");
1684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1685 dump_printf (dump_kind
, " %p", (void *)child
);
1686 dump_printf (dump_kind
, "\n");
1690 debug (slp_tree node
)
1692 debug_dump_context ctx
;
1693 vect_print_slp_tree (MSG_NOTE
,
1694 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
1698 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1701 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1702 slp_tree node
, hash_set
<slp_tree
> &visited
)
1707 if (visited
.add (node
))
1710 vect_print_slp_tree (dump_kind
, loc
, node
);
1712 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1713 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
1717 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1720 hash_set
<slp_tree
> visited
;
1721 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
1724 /* Mark the tree rooted at NODE with PURE_SLP. */
1727 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
1730 stmt_vec_info stmt_info
;
1733 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1736 if (visited
.add (node
))
1739 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1740 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
1742 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1743 vect_mark_slp_stmts (child
, visited
);
1747 vect_mark_slp_stmts (slp_tree node
)
1749 hash_set
<slp_tree
> visited
;
1750 vect_mark_slp_stmts (node
, visited
);
1753 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1756 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
1759 stmt_vec_info stmt_info
;
1762 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1765 if (visited
.add (node
))
1768 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1770 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
1771 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
1772 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
1775 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1776 vect_mark_slp_stmts_relevant (child
, visited
);
1780 vect_mark_slp_stmts_relevant (slp_tree node
)
1782 hash_set
<slp_tree
> visited
;
1783 vect_mark_slp_stmts_relevant (node
, visited
);
1786 /* Copy the SLP subtree rooted at NODE. */
1789 slp_copy_subtree (slp_tree node
, hash_map
<slp_tree
, slp_tree
> &map
)
1794 slp_tree
©_ref
= map
.get_or_insert (node
, &existed_p
);
1798 copy_ref
= new _slp_tree
;
1799 slp_tree copy
= copy_ref
;
1800 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
1801 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
1802 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
1803 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
1804 copy
->max_nunits
= node
->max_nunits
;
1806 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1807 SLP_TREE_SCALAR_STMTS (copy
) = SLP_TREE_SCALAR_STMTS (node
).copy ();
1808 if (SLP_TREE_SCALAR_OPS (node
).exists ())
1809 SLP_TREE_SCALAR_OPS (copy
) = SLP_TREE_SCALAR_OPS (node
).copy ();
1810 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1811 SLP_TREE_LOAD_PERMUTATION (copy
) = SLP_TREE_LOAD_PERMUTATION (node
).copy ();
1812 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1813 SLP_TREE_LANE_PERMUTATION (copy
) = SLP_TREE_LANE_PERMUTATION (node
).copy ();
1814 if (SLP_TREE_CHILDREN (node
).exists ())
1815 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
).copy ();
1816 gcc_assert (!SLP_TREE_VEC_STMTS (node
).exists ());
1819 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy
), i
, child
)
1821 SLP_TREE_CHILDREN (copy
)[i
] = slp_copy_subtree (child
, map
);
1822 SLP_TREE_CHILDREN (copy
)[i
]->refcnt
++;
1827 /* Rearrange the statements of NODE according to PERMUTATION. */
1830 vect_slp_rearrange_stmts (slp_tree node
, unsigned int group_size
,
1831 vec
<unsigned> permutation
,
1832 hash_set
<slp_tree
> &visited
)
1837 if (visited
.add (node
))
1840 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1841 vect_slp_rearrange_stmts (child
, group_size
, permutation
, visited
);
1843 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1845 gcc_assert (group_size
== SLP_TREE_SCALAR_STMTS (node
).length ());
1846 vec
<stmt_vec_info
> tmp_stmts
;
1847 tmp_stmts
.create (group_size
);
1848 tmp_stmts
.quick_grow (group_size
);
1849 stmt_vec_info stmt_info
;
1850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1851 tmp_stmts
[permutation
[i
]] = stmt_info
;
1852 SLP_TREE_SCALAR_STMTS (node
).release ();
1853 SLP_TREE_SCALAR_STMTS (node
) = tmp_stmts
;
1855 if (SLP_TREE_SCALAR_OPS (node
).exists ())
1857 gcc_assert (group_size
== SLP_TREE_SCALAR_OPS (node
).length ());
1859 tmp_ops
.create (group_size
);
1860 tmp_ops
.quick_grow (group_size
);
1862 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1863 tmp_ops
[permutation
[i
]] = op
;
1864 SLP_TREE_SCALAR_OPS (node
).release ();
1865 SLP_TREE_SCALAR_OPS (node
) = tmp_ops
;
1867 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1869 gcc_assert (group_size
== SLP_TREE_LANE_PERMUTATION (node
).length ());
1870 for (i
= 0; i
< group_size
; ++i
)
1871 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
1872 = permutation
[SLP_TREE_LANE_PERMUTATION (node
)[i
].second
];
1877 /* Attempt to reorder stmts in a reduction chain so that we don't
1878 require any load permutation. Return true if that was possible,
1879 otherwise return false. */
1882 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn
)
1886 slp_tree node
, load
;
1888 if (SLP_INSTANCE_LOADS (slp_instn
).is_empty ())
1891 /* Compare all the permutation sequences to the first one. We know
1892 that at least one load is permuted. */
1893 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1894 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1896 unsigned int group_size
= SLP_TREE_LOAD_PERMUTATION (node
).length ();
1897 for (i
= 1; SLP_INSTANCE_LOADS (slp_instn
).iterate (i
, &load
); ++i
)
1899 if (!SLP_TREE_LOAD_PERMUTATION (load
).exists ()
1900 || SLP_TREE_LOAD_PERMUTATION (load
).length () != group_size
)
1902 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load
), j
, lidx
)
1903 if (lidx
!= SLP_TREE_LOAD_PERMUTATION (node
)[j
])
1907 /* Check that the loads in the first sequence are different and there
1908 are no gaps between them and that there is an actual permutation. */
1909 bool any_permute
= false;
1910 auto_sbitmap
load_index (group_size
);
1911 bitmap_clear (load_index
);
1912 FOR_EACH_VEC_ELT (node
->load_permutation
, i
, lidx
)
1916 if (lidx
>= group_size
)
1918 if (bitmap_bit_p (load_index
, lidx
))
1921 bitmap_set_bit (load_index
, lidx
);
1925 for (i
= 0; i
< group_size
; i
++)
1926 if (!bitmap_bit_p (load_index
, i
))
1929 /* This permutation is valid for reduction. Since the order of the
1930 statements in the nodes is not important unless they are memory
1931 accesses, we can rearrange the statements in all the nodes
1932 according to the order of the loads. */
1934 /* We have to unshare the SLP tree we modify. */
1935 hash_map
<slp_tree
, slp_tree
> map
;
1936 slp_tree unshared
= slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn
), map
);
1937 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn
));
1939 SLP_INSTANCE_TREE (slp_instn
) = unshared
;
1940 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1941 SLP_INSTANCE_LOADS (slp_instn
)[i
] = *map
.get (node
);
1942 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1944 /* Do the actual re-arrangement. */
1945 hash_set
<slp_tree
> visited
;
1946 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn
), group_size
,
1947 node
->load_permutation
, visited
);
1949 /* We are done, no actual permutations need to be generated. */
1950 poly_uint64 unrolling_factor
= SLP_INSTANCE_UNROLLING_FACTOR (slp_instn
);
1951 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1953 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
1954 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
1955 /* But we have to keep those permutations that are required because
1956 of handling of gaps. */
1957 if (known_eq (unrolling_factor
, 1U)
1958 || (group_size
== DR_GROUP_SIZE (first_stmt_info
)
1959 && DR_GROUP_GAP (first_stmt_info
) == 0))
1960 SLP_TREE_LOAD_PERMUTATION (node
).release ();
1962 for (j
= 0; j
< SLP_TREE_LOAD_PERMUTATION (node
).length (); ++j
)
1963 SLP_TREE_LOAD_PERMUTATION (node
)[j
] = j
;
1969 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1972 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
1973 hash_set
<slp_tree
> &visited
)
1975 if (visited
.add (node
))
1978 if (SLP_TREE_CHILDREN (node
).length () == 0)
1980 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1982 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
1983 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1984 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1985 loads
.safe_push (node
);
1991 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1992 vect_gather_slp_loads (loads
, child
, visited
);
1997 vect_gather_slp_loads (slp_instance inst
, slp_tree node
)
1999 hash_set
<slp_tree
> visited
;
2000 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst
), node
, visited
);
2004 /* Find the last store in SLP INSTANCE. */
2007 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2009 stmt_vec_info last
= NULL
;
2010 stmt_vec_info stmt_vinfo
;
2012 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2014 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2015 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2021 /* Find the first stmt in NODE. */
2024 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2026 stmt_vec_info first
= NULL
;
2027 stmt_vec_info stmt_vinfo
;
2029 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2031 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2033 || get_later_stmt (stmt_vinfo
, first
) == first
)
2040 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2041 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2042 (also containing the first GROUP1_SIZE stmts, since stores are
2043 consecutive), the second containing the remainder.
2044 Return the first stmt in the second group. */
2046 static stmt_vec_info
2047 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2049 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2050 gcc_assert (group1_size
> 0);
2051 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2052 gcc_assert (group2_size
> 0);
2053 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2055 stmt_vec_info stmt_info
= first_vinfo
;
2056 for (unsigned i
= group1_size
; i
> 1; i
--)
2058 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2059 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2061 /* STMT is now the last element of the first group. */
2062 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2063 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2065 DR_GROUP_SIZE (group2
) = group2_size
;
2066 for (stmt_info
= group2
; stmt_info
;
2067 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2069 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2070 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2073 /* For the second group, the DR_GROUP_GAP is that before the original group,
2074 plus skipping over the first vector. */
2075 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2077 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2078 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2082 group1_size
, group2_size
);
2087 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2088 statements and a vector of NUNITS elements. */
2091 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2093 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2096 /* Analyze an SLP instance starting from a group of grouped stores. Call
2097 vect_build_slp_tree to build a tree of packed stmts if possible.
2098 Return FALSE if it's impossible to SLP any stmt in the loop. */
2101 vect_analyze_slp_instance (vec_info
*vinfo
,
2102 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2103 stmt_vec_info stmt_info
, unsigned max_tree_size
)
2105 slp_instance new_instance
;
2107 unsigned int group_size
;
2108 tree vectype
, scalar_type
= NULL_TREE
;
2110 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
2111 vec
<stmt_vec_info
> scalar_stmts
;
2112 bool constructor
= false;
2114 if (is_a
<bb_vec_info
> (vinfo
))
2115 vect_location
= stmt_info
->stmt
;
2116 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2118 scalar_type
= TREE_TYPE (DR_REF (dr
));
2119 group_size
= DR_GROUP_SIZE (stmt_info
);
2120 vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
, group_size
);
2122 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2124 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2125 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2126 group_size
= REDUC_GROUP_SIZE (stmt_info
);
2128 else if (is_gimple_assign (stmt_info
->stmt
)
2129 && gimple_assign_rhs_code (stmt_info
->stmt
) == CONSTRUCTOR
)
2131 vectype
= TREE_TYPE (gimple_assign_rhs1 (stmt_info
->stmt
));
2132 group_size
= CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info
->stmt
));
2137 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2138 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2139 group_size
= as_a
<loop_vec_info
> (vinfo
)->reductions
.length ();
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2146 "Build SLP failed: unsupported data-type %T\n",
2151 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
2153 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2154 scalar_stmts
.create (group_size
);
2155 stmt_vec_info next_info
= stmt_info
;
2156 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2158 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2161 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2162 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2165 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2167 /* Collect the reduction stmts and store them in
2168 SLP_TREE_SCALAR_STMTS. */
2171 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2172 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2174 /* Mark the first element of the reduction chain as reduction to properly
2175 transform the node. In the reduction analysis phase only the last
2176 element of the chain is marked as reduction. */
2177 STMT_VINFO_DEF_TYPE (stmt_info
)
2178 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2179 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2180 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2182 else if (constructor
)
2184 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2186 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2188 if (TREE_CODE (val
) == SSA_NAME
)
2190 gimple
* def
= SSA_NAME_DEF_STMT (val
);
2191 stmt_vec_info def_info
= vinfo
->lookup_stmt (def
);
2192 /* Value is defined in another basic block. */
2195 def_info
= vect_stmt_to_vectorize (def_info
);
2196 scalar_stmts
.safe_push (def_info
);
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_NOTE
, vect_location
,
2203 "Analyzing vectorizable constructor: %G\n",
2208 /* Collect reduction statements. */
2209 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2210 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2211 scalar_stmts
.safe_push (next_info
);
2214 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_NOTE
, vect_location
,
2217 "Starting SLP discovery for\n");
2218 for (i
= 0; i
< scalar_stmts
.length (); ++i
)
2219 dump_printf_loc (MSG_NOTE
, vect_location
,
2220 " %G", scalar_stmts
[i
]->stmt
);
2223 /* Build the tree for the SLP instance. */
2224 bool *matches
= XALLOCAVEC (bool, group_size
);
2225 unsigned npermutes
= 0;
2226 poly_uint64 max_nunits
= nunits
;
2227 unsigned tree_size
= 0;
2228 node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2229 &max_nunits
, matches
, &npermutes
,
2230 &tree_size
, bst_map
);
2233 /* Calculate the unrolling factor based on the smallest type. */
2234 poly_uint64 unrolling_factor
2235 = calculate_unrolling_factor (max_nunits
, group_size
);
2237 if (maybe_ne (unrolling_factor
, 1U)
2238 && is_a
<bb_vec_info
> (vinfo
))
2240 unsigned HOST_WIDE_INT const_max_nunits
;
2241 if (!max_nunits
.is_constant (&const_max_nunits
)
2242 || const_max_nunits
> group_size
)
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2246 "Build SLP failed: store group "
2247 "size not a multiple of the vector size "
2248 "in basic block SLP\n");
2249 vect_free_slp_tree (node
);
2252 /* Fatal mismatch. */
2253 if (dump_enabled_p ())
2254 dump_printf_loc (MSG_NOTE
, vect_location
,
2255 "SLP discovery succeeded but node needs "
2258 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2259 vect_free_slp_tree (node
);
2263 /* Create a new SLP instance. */
2264 new_instance
= XNEW (class _slp_instance
);
2265 SLP_INSTANCE_TREE (new_instance
) = node
;
2266 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2267 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2268 SLP_INSTANCE_ROOT_STMT (new_instance
) = constructor
? stmt_info
: NULL
;
2269 new_instance
->reduc_phis
= NULL
;
2270 new_instance
->cost_vec
= vNULL
;
2271 new_instance
->subgraph_entries
= vNULL
;
2273 vect_gather_slp_loads (new_instance
, node
);
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_NOTE
, vect_location
,
2276 "SLP size %u vs. limit %u.\n",
2277 tree_size
, max_tree_size
);
2279 /* Check whether any load is possibly permuted. */
2281 bool loads_permuted
= false;
2282 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2284 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2287 stmt_vec_info load_info
;
2288 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2289 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2291 loads_permuted
= true;
2296 /* If the loads and stores can be handled with load/store-lane
2297 instructions do not generate this SLP instance. */
2298 if (is_a
<loop_vec_info
> (vinfo
)
2300 && dr
&& vect_store_lanes_supported (vectype
, group_size
, false))
2303 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2305 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2306 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2307 /* Use SLP for strided accesses (or if we can't load-lanes). */
2308 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2309 || ! vect_load_lanes_supported
2310 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2311 DR_GROUP_SIZE (stmt_vinfo
), false))
2314 if (i
== SLP_INSTANCE_LOADS (new_instance
).length ())
2316 if (dump_enabled_p ())
2317 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2318 "Built SLP cancelled: can use "
2319 "load/store-lanes\n");
2320 vect_free_slp_instance (new_instance
);
2325 /* If this is a reduction chain with a conversion in front
2326 amend the SLP tree with a node for that. */
2328 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
2329 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
)
2331 /* Get at the conversion stmt - we know it's the single use
2332 of the last stmt of the reduction chain. */
2333 gimple
*tem
= vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2334 use_operand_p use_p
;
2336 bool r
= single_imm_use (gimple_assign_lhs (tem
),
2339 next_info
= vinfo
->lookup_stmt (use_stmt
);
2340 next_info
= vect_stmt_to_vectorize (next_info
);
2341 scalar_stmts
= vNULL
;
2342 scalar_stmts
.create (group_size
);
2343 for (unsigned i
= 0; i
< group_size
; ++i
)
2344 scalar_stmts
.quick_push (next_info
);
2345 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2346 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2347 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2348 SLP_INSTANCE_TREE (new_instance
) = conv
;
2349 /* We also have to fake this conversion stmt as SLP reduction
2350 group so we don't have to mess with too much code
2352 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2353 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2356 vinfo
->slp_instances
.safe_push (new_instance
);
2358 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2359 the number of scalar stmts in the root in a few places.
2360 Verify that assumption holds. */
2361 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2362 .length () == group_size
);
2364 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE
, vect_location
,
2367 "Final SLP tree for instance:\n");
2368 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2369 SLP_INSTANCE_TREE (new_instance
));
2377 /* Failed to SLP. */
2378 /* Free the allocated memory. */
2379 scalar_stmts
.release ();
2382 /* For basic block SLP, try to break the group up into multiples of the
2384 unsigned HOST_WIDE_INT const_nunits
;
2385 if (is_a
<bb_vec_info
> (vinfo
)
2386 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2387 && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info
))
2388 && nunits
.is_constant (&const_nunits
))
2390 /* We consider breaking the group only on VF boundaries from the existing
2392 for (i
= 0; i
< group_size
; i
++)
2393 if (!matches
[i
]) break;
2395 if (i
>= const_nunits
&& i
< group_size
)
2397 /* Split into two groups at the first vector boundary before i. */
2398 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2399 unsigned group1_size
= i
& ~(const_nunits
- 1);
2401 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_NOTE
, vect_location
,
2403 "Splitting SLP group at stmt %u\n", i
);
2404 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2406 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2408 /* If the first non-match was in the middle of a vector,
2409 skip the rest of that vector. */
2410 if (group1_size
< i
)
2412 i
= group1_size
+ const_nunits
;
2414 rest
= vect_split_slp_store_group (rest
, const_nunits
);
2417 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2418 rest
, max_tree_size
);
2421 /* Even though the first vector did not all match, we might be able to SLP
2422 (some) of the remainder. FORNOW ignore this possibility. */
2425 /* Failed to SLP. */
2426 if (dump_enabled_p ())
2427 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2432 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2433 trees of packed scalar stmts if SLP is possible. */
2436 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2439 stmt_vec_info first_element
;
2441 DUMP_VECT_SCOPE ("vect_analyze_slp");
2443 scalar_stmts_to_slp_tree_map_t
*bst_map
2444 = new scalar_stmts_to_slp_tree_map_t ();
2446 /* Find SLP sequences starting from groups of grouped stores. */
2447 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2448 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
, max_tree_size
);
2450 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2452 if (loop_vinfo
->reduction_chains
.length () > 0)
2454 /* Find SLP sequences starting from reduction chains. */
2455 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2456 if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2459 /* Dissolve reduction chain group. */
2460 stmt_vec_info vinfo
= first_element
;
2461 stmt_vec_info last
= NULL
;
2464 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2465 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2466 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2470 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2471 /* It can be still vectorized as part of an SLP reduction. */
2472 loop_vinfo
->reductions
.safe_push (last
);
2476 /* Find SLP sequences starting from groups of reductions. */
2477 if (loop_vinfo
->reductions
.length () > 1)
2478 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2482 /* The map keeps a reference on SLP nodes built, release that. */
2483 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2484 it
!= bst_map
->end (); ++it
)
2486 vect_free_slp_tree ((*it
).second
);
2489 /* Optimize permutations in SLP reductions. */
2490 slp_instance instance
;
2491 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
2493 slp_tree node
= SLP_INSTANCE_TREE (instance
);
2494 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2495 /* Reduction (there are no data-refs in the root).
2496 In reduction chain the order of the loads is not important. */
2497 if (!STMT_VINFO_DATA_REF (stmt_info
)
2498 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2499 vect_attempt_slp_rearrange_stmts (instance
);
2502 /* Gather all loads in the SLP graph. */
2503 hash_set
<slp_tree
> visited
;
2504 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
2505 vect_gather_slp_loads (vinfo
->slp_loads
, SLP_INSTANCE_TREE (instance
),
2508 return opt_result::success ();
2512 vect_optimize_slp (vec_info
*vinfo
)
2516 FOR_EACH_VEC_ELT (vinfo
->slp_loads
, i
, node
)
2518 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2521 /* In basic block vectorization we allow any subchain of an interleaving
2523 FORNOW: not in loop SLP because of realignment complications. */
2524 if (is_a
<bb_vec_info
> (vinfo
))
2526 bool subchain_p
= true;
2527 stmt_vec_info next_load_info
= NULL
;
2528 stmt_vec_info load_info
;
2530 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
2533 && (next_load_info
!= load_info
2534 || DR_GROUP_GAP (load_info
) != 1))
2539 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
2543 SLP_TREE_LOAD_PERMUTATION (node
).release ();
2549 stmt_vec_info load_info
;
2550 bool this_load_permuted
= false;
2552 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
2553 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
2555 this_load_permuted
= true;
2558 stmt_vec_info first_stmt_info
2559 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
2560 if (!this_load_permuted
2561 /* The load requires permutation when unrolling exposes
2562 a gap either because the group is larger than the SLP
2563 group-size or because there is a gap between the groups. */
2564 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a
<loop_vec_info
> (vinfo
)), 1U)
2565 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
2566 && DR_GROUP_GAP (first_stmt_info
) == 0)))
2568 SLP_TREE_LOAD_PERMUTATION (node
).release ();
2576 /* For each possible SLP instance decide whether to SLP it and calculate overall
2577 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2578 least one instance. */
2581 vect_make_slp_decision (loop_vec_info loop_vinfo
)
2584 poly_uint64 unrolling_factor
= 1;
2585 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
2586 slp_instance instance
;
2587 int decided_to_slp
= 0;
2589 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2591 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2593 /* FORNOW: SLP if you can. */
2594 /* All unroll factors have the form:
2596 GET_MODE_SIZE (vinfo->vector_mode) * X
2598 for some rational X, so they must have a common multiple. */
2600 = force_common_multiple (unrolling_factor
,
2601 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
2603 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2604 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2605 loop-based vectorization. Such stmts will be marked as HYBRID. */
2606 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
2610 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
2612 if (decided_to_slp
&& dump_enabled_p ())
2614 dump_printf_loc (MSG_NOTE
, vect_location
,
2615 "Decided to SLP %d instances. Unrolling factor ",
2617 dump_dec (MSG_NOTE
, unrolling_factor
);
2618 dump_printf (MSG_NOTE
, "\n");
2621 return (decided_to_slp
> 0);
2624 /* Private data for vect_detect_hybrid_slp. */
2627 loop_vec_info loop_vinfo
;
2628 vec
<stmt_vec_info
> *worklist
;
2631 /* Walker for walk_gimple_op. */
2634 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
2636 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
2637 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
2642 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
2645 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
2646 if (PURE_SLP_STMT (def_stmt_info
))
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
2650 def_stmt_info
->stmt
);
2651 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
2652 dat
->worklist
->safe_push (def_stmt_info
);
2658 /* Find stmts that must be both vectorized and SLPed. */
2661 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
2663 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2665 /* All stmts participating in SLP are marked pure_slp, all other
2666 stmts are loop_vect.
2667 First collect all loop_vect stmts into a worklist. */
2668 auto_vec
<stmt_vec_info
> worklist
;
2669 for (unsigned i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2671 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2672 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
2675 gphi
*phi
= gsi
.phi ();
2676 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
2677 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
2678 worklist
.safe_push (stmt_info
);
2680 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
2683 gimple
*stmt
= gsi_stmt (gsi
);
2684 if (is_gimple_debug (stmt
))
2686 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
2687 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2689 for (gimple_stmt_iterator gsi2
2690 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
2691 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
2693 stmt_vec_info patt_info
2694 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
2695 if (!STMT_SLP_TYPE (patt_info
)
2696 && STMT_VINFO_RELEVANT (patt_info
))
2697 worklist
.safe_push (patt_info
);
2699 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
2701 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
2702 worklist
.safe_push (stmt_info
);
2706 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2707 mark any SLP vectorized stmt as hybrid.
2708 ??? We're visiting def stmts N times (once for each non-SLP and
2709 once for each hybrid-SLP use). */
2712 dat
.worklist
= &worklist
;
2713 dat
.loop_vinfo
= loop_vinfo
;
2714 memset (&wi
, 0, sizeof (wi
));
2715 wi
.info
= (void *)&dat
;
2716 while (!worklist
.is_empty ())
2718 stmt_vec_info stmt_info
= worklist
.pop ();
2719 /* Since SSA operands are not set up for pattern stmts we need
2720 to use walk_gimple_op. */
2722 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
2727 /* Initialize a bb_vec_info struct for the statements between
2728 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2730 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in
,
2731 gimple_stmt_iterator region_end_in
,
2732 vec_info_shared
*shared
)
2733 : vec_info (vec_info::bb
, init_cost (NULL
), shared
),
2734 bb (gsi_bb (region_begin_in
)),
2735 region_begin (region_begin_in
),
2736 region_end (region_end_in
)
2738 for (gimple
*stmt
: this->region_stmts ())
2740 gimple_set_uid (stmt
, 0);
2741 if (is_gimple_debug (stmt
))
2750 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2751 stmts in the basic block. */
2753 _bb_vec_info::~_bb_vec_info ()
2755 for (gimple
*stmt
: this->region_stmts ())
2756 /* Reset region marker. */
2757 gimple_set_uid (stmt
, -1);
2762 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2763 given then that child nodes have already been processed, and that
2764 their def types currently match their SLP node's def type. */
2767 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
2768 slp_instance node_instance
,
2769 stmt_vector_for_cost
*cost_vec
)
2771 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
2772 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
2774 /* Calculate the number of vector statements to be created for the
2775 scalar stmts in this node. For SLP reductions it is equal to the
2776 number of vector statements in the children (which has already been
2777 calculated by the recursive call). Otherwise it is the number of
2778 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2779 VF divided by the number of elements in a vector. */
2780 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2781 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2783 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
2784 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
2786 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
2787 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
2794 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2795 vf
= loop_vinfo
->vectorization_factor
;
2798 unsigned int group_size
= SLP_TREE_LANES (node
);
2799 tree vectype
= SLP_TREE_VECTYPE (node
);
2800 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
2801 = vect_get_num_vectors (vf
* group_size
, vectype
);
2804 /* Handle purely internal nodes. */
2805 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2806 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
2808 if (is_a
<bb_vec_info
> (vinfo
)
2809 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
2813 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
2814 node
, node_instance
, cost_vec
);
2817 /* Try to build NODE from scalars, returning true on success.
2818 NODE_INSTANCE is the SLP instance that contains NODE. */
2821 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
2822 slp_instance node_instance
)
2824 stmt_vec_info stmt_info
;
2827 if (!is_a
<bb_vec_info
> (vinfo
)
2828 || node
== SLP_INSTANCE_TREE (node_instance
)
2829 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
2830 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
2833 if (dump_enabled_p ())
2834 dump_printf_loc (MSG_NOTE
, vect_location
,
2835 "Building vector operands from scalars instead\n");
2837 /* Don't remove and free the child nodes here, since they could be
2838 referenced by other structures. The analysis and scheduling phases
2839 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2840 unsigned int group_size
= SLP_TREE_LANES (node
);
2841 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
2842 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
2843 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2845 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
2846 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
2851 /* Compute the prologue cost for invariant or constant operands represented
2855 vect_prologue_cost_for_slp (slp_tree node
,
2856 stmt_vector_for_cost
*cost_vec
)
2858 /* There's a special case of an existing vector, that costs nothing. */
2859 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
2860 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
2862 /* Without looking at the actual initializer a vector of
2863 constants can be implemented as load from the constant pool.
2864 When all elements are the same we can use a splat. */
2865 tree vectype
= SLP_TREE_VECTYPE (node
);
2866 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
2867 unsigned num_vects_to_check
;
2868 unsigned HOST_WIDE_INT const_nunits
;
2869 unsigned nelt_limit
;
2870 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
2871 && ! multiple_p (const_nunits
, group_size
))
2873 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
2874 nelt_limit
= const_nunits
;
2878 /* If either the vector has variable length or the vectors
2879 are composed of repeated whole groups we only need to
2880 cost construction once. All vectors will be the same. */
2881 num_vects_to_check
= 1;
2882 nelt_limit
= group_size
;
2884 tree elt
= NULL_TREE
;
2886 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
2888 unsigned si
= j
% group_size
;
2890 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
2891 /* ??? We're just tracking whether all operands of a single
2892 vector initializer are the same, ideally we'd check if
2893 we emitted the same one already. */
2894 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
2897 if (nelt
== nelt_limit
)
2899 record_stmt_cost (cost_vec
, 1,
2900 SLP_TREE_DEF_TYPE (node
) == vect_external_def
2901 ? (elt
? scalar_to_vec
: vec_construct
)
2903 NULL
, vectype
, 0, vect_prologue
);
2909 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2910 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2912 Return true if the operations are supported. */
2915 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
2916 slp_instance node_instance
,
2917 hash_set
<slp_tree
> &visited
,
2918 hash_set
<slp_tree
> &lvisited
,
2919 stmt_vector_for_cost
*cost_vec
)
2924 /* Assume we can code-generate all invariants. */
2925 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2928 /* If we already analyzed the exact same set of scalar stmts we're done.
2929 We share the generated vector stmts for those.
2930 The SLP graph is acyclic so not caching whether we failed or succeeded
2931 doesn't result in any issue since we throw away the lvisited set
2933 if (visited
.contains (node
)
2934 || lvisited
.contains (node
))
2938 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2940 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
2941 visited
, lvisited
, cost_vec
);
2948 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
2951 lvisited
.add (node
);
2954 /* When the node can be vectorized cost invariant nodes it references.
2955 This is not done in DFS order to allow the refering node
2956 vectorizable_* calls to nail down the invariant nodes vector type
2957 and possibly unshare it if it needs a different vector type than
2960 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2961 if ((SLP_TREE_DEF_TYPE (child
) == vect_constant_def
2962 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
2963 /* Perform usual caching, note code-generation still
2964 code-gens these nodes multiple times but we expect
2965 to CSE them later. */
2966 && !visited
.contains (child
)
2967 && !lvisited
.add (child
))
2969 /* ??? After auditing more code paths make a "default"
2970 and push the vector type from NODE to all children
2971 if it is not already set. */
2972 /* Compute the number of vectors to be generated. */
2973 tree vector_type
= SLP_TREE_VECTYPE (child
);
2976 /* For shifts with a scalar argument we don't need
2977 to cost or code-generate anything.
2978 ??? Represent this more explicitely. */
2979 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
2980 == shift_vec_info_type
)
2984 unsigned group_size
= SLP_TREE_LANES (child
);
2986 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2987 vf
= loop_vinfo
->vectorization_factor
;
2988 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
2989 = vect_get_num_vectors (vf
* group_size
, vector_type
);
2990 /* And cost them. */
2991 vect_prologue_cost_for_slp (child
, cost_vec
);
2994 /* If this node or any of its children can't be vectorized, try pruning
2995 the tree here rather than felling the whole thing. */
2996 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
2998 /* We'll need to revisit this for invariant costing and number
2999 of vectorized stmt setting. */
3007 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3008 region and that can be vectorized using vectorizable_live_operation
3009 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3010 scalar code computing it to be retained. */
3013 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3014 slp_instance instance
,
3015 stmt_vector_for_cost
*cost_vec
,
3016 hash_set
<stmt_vec_info
> &svisited
)
3019 stmt_vec_info stmt_info
;
3020 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3021 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3023 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3024 if (svisited
.contains (orig_stmt_info
))
3026 bool mark_visited
= true;
3027 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3028 ssa_op_iter op_iter
;
3029 def_operand_p def_p
;
3030 FOR_EACH_SSA_DEF_OPERAND (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3032 imm_use_iterator use_iter
;
3034 stmt_vec_info use_stmt_info
;
3035 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3036 if (!is_gimple_debug (use_stmt
))
3038 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
3040 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3042 STMT_VINFO_LIVE_P (stmt_info
) = true;
3043 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3044 NULL
, node
, instance
, i
,
3046 /* ??? So we know we can vectorize the live stmt
3047 from one SLP node. If we cannot do so from all
3048 or none consistently we'd have to record which
3049 SLP node (and lane) we want to use for the live
3050 operation. So make sure we can code-generate
3052 mark_visited
= false;
3054 STMT_VINFO_LIVE_P (stmt_info
) = false;
3055 BREAK_FROM_IMM_USE_STMT (use_iter
);
3058 /* We have to verify whether we can insert the lane extract
3059 before all uses. The following is a conservative approximation.
3060 We cannot put this into vectorizable_live_operation because
3061 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3063 Note that while the fact that we emit code for loads at the
3064 first load should make this a non-problem leafs we construct
3065 from scalars are vectorized after the last scalar def.
3066 ??? If we'd actually compute the insert location during
3067 analysis we could use sth less conservative than the last
3068 scalar stmt in the node for the dominance check. */
3069 /* ??? What remains is "live" uses in vector CTORs in the same
3070 SLP graph which is where those uses can end up code-generated
3071 right after their definition instead of close to their original
3072 use. But that would restrict us to code-generate lane-extracts
3073 from the latest stmt in a node. So we compensate for this
3074 during code-generation, simply not replacing uses for those
3075 hopefully rare cases. */
3076 if (STMT_VINFO_LIVE_P (stmt_info
))
3077 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3078 if (!is_gimple_debug (use_stmt
)
3079 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
3080 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3081 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3085 "Cannot determine insertion place for "
3087 STMT_VINFO_LIVE_P (stmt_info
) = false;
3088 mark_visited
= true;
3092 svisited
.add (orig_stmt_info
);
3096 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3097 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3098 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
3099 cost_vec
, svisited
);
3102 /* Analyze statements in SLP instances of VINFO. Return true if the
3103 operations are supported. */
3106 vect_slp_analyze_operations (vec_info
*vinfo
)
3108 slp_instance instance
;
3111 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3113 hash_set
<slp_tree
> visited
;
3114 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
3116 hash_set
<slp_tree
> lvisited
;
3117 stmt_vector_for_cost cost_vec
;
3118 cost_vec
.create (2);
3119 if (is_a
<bb_vec_info
> (vinfo
))
3120 vect_location
= instance
->location ();
3121 if (!vect_slp_analyze_node_operations (vinfo
,
3122 SLP_INSTANCE_TREE (instance
),
3123 instance
, visited
, lvisited
,
3125 /* Instances with a root stmt require vectorized defs for the
3127 || (SLP_INSTANCE_ROOT_STMT (instance
)
3128 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
3129 != vect_internal_def
)))
3131 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3132 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3133 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_NOTE
, vect_location
,
3135 "removing SLP instance operations starting from: %G",
3137 vect_free_slp_instance (instance
);
3138 vinfo
->slp_instances
.ordered_remove (i
);
3139 cost_vec
.release ();
3143 for (hash_set
<slp_tree
>::iterator x
= lvisited
.begin();
3144 x
!= lvisited
.end(); ++x
)
3148 /* For BB vectorization remember the SLP graph entry
3150 if (is_a
<bb_vec_info
> (vinfo
))
3151 instance
->cost_vec
= cost_vec
;
3154 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
3155 cost_vec
.release ();
3160 /* Compute vectorizable live stmts. */
3161 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3163 hash_set
<stmt_vec_info
> svisited
;
3164 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3166 vect_location
= instance
->location ();
3167 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
3168 instance
, &instance
->cost_vec
, svisited
);
3172 return !vinfo
->slp_instances
.is_empty ();
3175 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3176 closing the eventual chain. */
3179 get_ultimate_leader (slp_instance instance
,
3180 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3182 auto_vec
<slp_instance
*, 8> chain
;
3184 while (*(tem
= instance_leader
.get (instance
)) != instance
)
3186 chain
.safe_push (tem
);
3189 while (!chain
.is_empty ())
3190 *chain
.pop () = instance
;
3194 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3197 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
3198 slp_instance instance
, slp_tree node
,
3199 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
3200 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3202 stmt_vec_info stmt_info
;
3205 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3208 slp_instance
&stmt_instance
3209 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
3212 else if (stmt_instance
!= instance
)
3214 /* If we're running into a previously marked stmt make us the
3215 leader of the current ultimate leader. This keeps the
3216 leader chain acyclic and works even when the current instance
3217 connects two previously independent graph parts. */
3218 slp_instance stmt_leader
3219 = get_ultimate_leader (stmt_instance
, instance_leader
);
3220 if (stmt_leader
!= instance
)
3221 instance_leader
.put (stmt_leader
, instance
);
3223 stmt_instance
= instance
;
3225 /* If not all stmts had been visited we have to recurse on children. */
3230 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3231 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3232 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
3236 /* Partition the SLP graph into pieces that can be costed independently. */
3239 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
3241 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
3243 /* First walk the SLP graph assigning each involved scalar stmt a
3244 corresponding SLP graph entry and upon visiting a previously
3245 marked stmt, make the stmts leader the current SLP graph entry. */
3246 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
3247 hash_map
<slp_instance
, slp_instance
> instance_leader
;
3248 slp_instance instance
;
3249 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3251 instance_leader
.put (instance
, instance
);
3252 vect_bb_partition_graph_r (bb_vinfo
,
3253 instance
, SLP_INSTANCE_TREE (instance
),
3254 stmt_to_instance
, instance_leader
);
3257 /* Then collect entries to each independent subgraph. */
3258 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3260 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
3261 leader
->subgraph_entries
.safe_push (instance
);
3262 if (dump_enabled_p ()
3263 && leader
!= instance
)
3264 dump_printf_loc (MSG_NOTE
, vect_location
,
3265 "instance %p is leader of %p\n",
3270 /* Compute the scalar cost of the SLP node NODE and its children
3271 and return it. Do not account defs that are marked in LIFE and
3272 update LIFE according to uses of NODE. */
3275 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
3276 slp_tree node
, vec
<bool, va_heap
> *life
,
3277 stmt_vector_for_cost
*cost_vec
,
3278 hash_set
<slp_tree
> &visited
)
3281 stmt_vec_info stmt_info
;
3284 if (visited
.add (node
))
3287 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3289 ssa_op_iter op_iter
;
3290 def_operand_p def_p
;
3295 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3296 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3298 /* If there is a non-vectorized use of the defs then the scalar
3299 stmt is kept live in which case we do not account it or any
3300 required defs in the SLP children in the scalar cost. This
3301 way we make the vectorization more costly when compared to
3303 if (!STMT_VINFO_LIVE_P (stmt_info
))
3305 FOR_EACH_SSA_DEF_OPERAND (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3307 imm_use_iterator use_iter
;
3309 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3310 if (!is_gimple_debug (use_stmt
))
3312 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
3315 (vect_stmt_to_vectorize (use_stmt_info
)))
3318 BREAK_FROM_IMM_USE_STMT (use_iter
);
3326 /* Count scalar stmts only once. */
3327 if (gimple_visited_p (orig_stmt
))
3329 gimple_set_visited (orig_stmt
, true);
3331 vect_cost_for_stmt kind
;
3332 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
3334 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
3337 kind
= scalar_store
;
3339 else if (vect_nop_conversion_p (orig_stmt_info
))
3343 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
, 0, vect_body
);
3346 auto_vec
<bool, 20> subtree_life
;
3347 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3349 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3351 /* Do not directly pass LIFE to the recursive call, copy it to
3352 confine changes in the callee to the current child/subtree. */
3353 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3355 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
3356 for (unsigned j
= 0;
3357 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
3359 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
3360 if (perm
.first
== i
)
3361 subtree_life
[perm
.second
] = (*life
)[j
];
3366 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
3367 subtree_life
.safe_splice (*life
);
3369 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
3371 subtree_life
.truncate (0);
3376 /* Check if vectorization of the basic block is profitable for the
3377 subgraph denoted by SLP_INSTANCES. */
3380 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
3381 vec
<slp_instance
> slp_instances
)
3383 slp_instance instance
;
3385 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
3386 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
3388 void *vect_target_cost_data
= init_cost (NULL
);
3390 /* Calculate scalar cost and sum the cost for the vector stmts
3391 previously collected. */
3392 stmt_vector_for_cost scalar_costs
;
3393 scalar_costs
.create (0);
3394 hash_set
<slp_tree
> visited
;
3395 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3397 auto_vec
<bool, 20> life
;
3398 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
3400 vect_bb_slp_scalar_cost (bb_vinfo
,
3401 SLP_INSTANCE_TREE (instance
),
3402 &life
, &scalar_costs
, visited
);
3403 add_stmt_costs (bb_vinfo
, vect_target_cost_data
, &instance
->cost_vec
);
3404 instance
->cost_vec
.release ();
3406 /* Unset visited flag. */
3407 stmt_info_for_cost
*si
;
3408 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
3409 gimple_set_visited (si
->stmt_info
->stmt
, false);
3411 void *scalar_target_cost_data
= init_cost (NULL
);
3412 add_stmt_costs (bb_vinfo
, scalar_target_cost_data
, &scalar_costs
);
3413 scalar_costs
.release ();
3415 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
3416 destroy_cost_data (scalar_target_cost_data
);
3418 /* Complete the target-specific vector cost calculation. */
3419 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
3420 &vec_inside_cost
, &vec_epilogue_cost
);
3421 destroy_cost_data (vect_target_cost_data
);
3423 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
3425 if (dump_enabled_p ())
3427 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3428 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
3430 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
3431 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
3432 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
3435 /* Vectorization is profitable if its cost is more than the cost of scalar
3436 version. Note that we err on the vector side for equal cost because
3437 the cost estimate is otherwise quite pessimistic (constant uses are
3438 free on the scalar side but cost a load on the vector side for
3440 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
3446 /* Find any vectorizable constructors and add them to the grouped_store
3450 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
3452 for (gimple
*stmt
: bb_vinfo
->region_stmts ())
3454 gassign
*assign
= dyn_cast
<gassign
*> (stmt
);
3455 if (!assign
|| gimple_assign_rhs_code (assign
) != CONSTRUCTOR
)
3458 tree rhs
= gimple_assign_rhs1 (assign
);
3459 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
3460 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
3461 CONSTRUCTOR_NELTS (rhs
))
3462 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
3463 || uniform_vector_p (rhs
))
3466 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
3467 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
3471 /* Check if the region described by BB_VINFO can be vectorized, returning
3472 true if so. When returning false, set FATAL to true if the same failure
3473 would prevent vectorization at other vector sizes, false if it is still
3474 worth trying other sizes. N_STMTS is the number of statements in the
3478 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
3479 vec
<int> *dataref_groups
)
3481 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3483 slp_instance instance
;
3485 poly_uint64 min_vf
= 2;
3487 /* The first group of checks is independent of the vector size. */
3490 /* Analyze the data references. */
3492 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
3494 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3496 "not vectorized: unhandled data-ref in basic "
3501 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3505 "not vectorized: unhandled data access in "
3510 vect_slp_check_for_constructors (bb_vinfo
);
3512 /* If there are no grouped stores and no constructors in the region
3513 there is no need to continue with pattern recog as vect_analyze_slp
3514 will fail anyway. */
3515 if (bb_vinfo
->grouped_stores
.is_empty ())
3517 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3519 "not vectorized: no grouped stores in "
3524 /* While the rest of the analysis below depends on it in some way. */
3527 vect_pattern_recog (bb_vinfo
);
3529 /* Check the SLP opportunities in the basic block, analyze and build SLP
3531 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
3533 if (dump_enabled_p ())
3535 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3536 "Failed to SLP the basic block.\n");
3537 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3538 "not vectorized: failed to find SLP opportunities "
3539 "in basic block.\n");
3544 /* Optimize permutations. */
3545 vect_optimize_slp (bb_vinfo
);
3547 vect_record_base_alignments (bb_vinfo
);
3549 /* Analyze and verify the alignment of data references and the
3550 dependence in the SLP instances. */
3551 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
3553 vect_location
= instance
->location ();
3554 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
3555 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
3557 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3558 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3559 if (dump_enabled_p ())
3560 dump_printf_loc (MSG_NOTE
, vect_location
,
3561 "removing SLP instance operations starting from: %G",
3563 vect_free_slp_instance (instance
);
3564 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
3568 /* Mark all the statements that we want to vectorize as pure SLP and
3570 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3571 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
3572 if (SLP_INSTANCE_ROOT_STMT (instance
))
3573 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance
)) = pure_slp
;
3577 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
3580 if (!vect_slp_analyze_operations (bb_vinfo
))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3584 "not vectorized: bad operation in basic block.\n");
3588 vect_bb_partition_graph (bb_vinfo
);
3593 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3594 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3595 on success. The region has N_STMTS statements and has the datarefs
3596 given by DATAREFS. */
3599 vect_slp_region (gimple_stmt_iterator region_begin
,
3600 gimple_stmt_iterator region_end
,
3601 vec
<data_reference_p
> datarefs
,
3602 vec
<int> *dataref_groups
,
3603 unsigned int n_stmts
)
3605 bb_vec_info bb_vinfo
;
3606 auto_vector_modes vector_modes
;
3608 /* Autodetect first vector size we try. */
3609 machine_mode next_vector_mode
= VOIDmode
;
3610 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
3611 unsigned int mode_i
= 0;
3613 vec_info_shared shared
;
3615 machine_mode autodetected_vector_mode
= VOIDmode
;
3618 bool vectorized
= false;
3620 bb_vinfo
= new _bb_vec_info (region_begin
, region_end
, &shared
);
3622 bool first_time_p
= shared
.datarefs
.is_empty ();
3623 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
3625 bb_vinfo
->shared
->save_datarefs ();
3627 bb_vinfo
->shared
->check_datarefs ();
3628 bb_vinfo
->vector_mode
= next_vector_mode
;
3630 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
)
3631 && dbg_cnt (vect_slp
))
3633 if (dump_enabled_p ())
3635 dump_printf_loc (MSG_NOTE
, vect_location
,
3636 "***** Analysis succeeded with vector mode"
3637 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
3638 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
3641 bb_vinfo
->shared
->check_datarefs ();
3644 slp_instance instance
;
3645 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
3647 if (instance
->subgraph_entries
.is_empty ())
3650 vect_location
= instance
->location ();
3651 if (!unlimited_cost_model (NULL
)
3652 && !vect_bb_vectorization_profitable_p
3653 (bb_vinfo
, instance
->subgraph_entries
))
3655 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3657 "not vectorized: vectorization is not "
3662 if (!vectorized
&& dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE
, vect_location
,
3664 "Basic block will be vectorized "
3668 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
3670 unsigned HOST_WIDE_INT bytes
;
3671 if (dump_enabled_p ())
3674 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
3675 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
3676 "basic block part vectorized using %wu "
3677 "byte vectors\n", bytes
);
3679 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
3680 "basic block part vectorized using "
3681 "variable length vectors\n");
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE
, vect_location
,
3689 "***** Analysis failed with vector mode %s\n",
3690 GET_MODE_NAME (bb_vinfo
->vector_mode
));
3694 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
3697 while (mode_i
< vector_modes
.length ()
3698 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
3700 if (dump_enabled_p ())
3701 dump_printf_loc (MSG_NOTE
, vect_location
,
3702 "***** The result for vector mode %s would"
3704 GET_MODE_NAME (vector_modes
[mode_i
]));
3710 if (mode_i
< vector_modes
.length ()
3711 && VECTOR_MODE_P (autodetected_vector_mode
)
3712 && (related_vector_mode (vector_modes
[mode_i
],
3713 GET_MODE_INNER (autodetected_vector_mode
))
3714 == autodetected_vector_mode
)
3715 && (related_vector_mode (autodetected_vector_mode
,
3716 GET_MODE_INNER (vector_modes
[mode_i
]))
3717 == vector_modes
[mode_i
]))
3719 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_NOTE
, vect_location
,
3721 "***** Skipping vector mode %s, which would"
3722 " repeat the analysis for %s\n",
3723 GET_MODE_NAME (vector_modes
[mode_i
]),
3724 GET_MODE_NAME (autodetected_vector_mode
));
3729 || mode_i
== vector_modes
.length ()
3730 || autodetected_vector_mode
== VOIDmode
3731 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3732 vector sizes will fail do not bother iterating. */
3736 /* Try the next biggest vector size. */
3737 next_vector_mode
= vector_modes
[mode_i
++];
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE
, vect_location
,
3740 "***** Re-trying analysis with vector mode %s\n",
3741 GET_MODE_NAME (next_vector_mode
));
3745 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3746 true if anything in the basic-block was vectorized. */
3749 vect_slp_bb (basic_block bb
)
3751 vec
<data_reference_p
> datarefs
= vNULL
;
3752 vec
<int> dataref_groups
= vNULL
;
3754 int current_group
= 0;
3755 gimple_stmt_iterator region_begin
= gsi_start_nondebug_after_labels_bb (bb
);
3756 gimple_stmt_iterator region_end
= gsi_last_bb (bb
);
3757 if (!gsi_end_p (region_end
))
3758 gsi_next (®ion_end
);
3760 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
3763 gimple
*stmt
= gsi_stmt (gsi
);
3764 if (is_gimple_debug (stmt
))
3769 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
3770 vect_location
= stmt
;
3772 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
3773 &dataref_groups
, current_group
))
3776 if (insns
> param_slp_max_insns_in_bb
)
3778 if (dump_enabled_p ())
3779 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3780 "not vectorized: too many instructions in "
3785 return vect_slp_region (region_begin
, region_end
, datarefs
,
3786 &dataref_groups
, insns
);
3790 /* Build a variable-length vector in which the elements in ELTS are repeated
3791 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3792 RESULTS and add any new instructions to SEQ.
3794 The approach we use is:
3796 (1) Find a vector mode VM with integer elements of mode IM.
3798 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3799 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3800 from small vectors to IM.
3802 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3804 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3805 correct byte contents.
3807 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3809 We try to find the largest IM for which this sequence works, in order
3810 to cut down on the number of interleaves. */
3813 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
3814 vec
<tree
> elts
, unsigned int nresults
,
3817 unsigned int nelts
= elts
.length ();
3818 tree element_type
= TREE_TYPE (vector_type
);
3820 /* (1) Find a vector mode VM with integer elements of mode IM. */
3821 unsigned int nvectors
= 1;
3822 tree new_vector_type
;
3824 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
3825 &nvectors
, &new_vector_type
,
3829 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3830 unsigned int partial_nelts
= nelts
/ nvectors
;
3831 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
3833 tree_vector_builder partial_elts
;
3834 auto_vec
<tree
, 32> pieces (nvectors
* 2);
3835 pieces
.quick_grow (nvectors
* 2);
3836 for (unsigned int i
= 0; i
< nvectors
; ++i
)
3838 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3839 ELTS' has mode IM. */
3840 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
3841 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
3842 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
3843 tree t
= gimple_build_vector (seq
, &partial_elts
);
3844 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
3845 TREE_TYPE (new_vector_type
), t
);
3847 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3848 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
3851 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3852 correct byte contents.
3854 We need to repeat the following operation log2(nvectors) times:
3856 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3857 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3859 However, if each input repeats every N elements and the VF is
3860 a multiple of N * 2, the HI result is the same as the LO. */
3861 unsigned int in_start
= 0;
3862 unsigned int out_start
= nvectors
;
3863 unsigned int hi_start
= nvectors
/ 2;
3864 /* A bound on the number of outputs needed to produce NRESULTS results
3865 in the final iteration. */
3866 unsigned int noutputs_bound
= nvectors
* nresults
;
3867 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
3869 noutputs_bound
/= 2;
3870 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
3871 for (unsigned int i
= 0; i
< limit
; ++i
)
3874 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
3877 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
3881 tree output
= make_ssa_name (new_vector_type
);
3882 tree input1
= pieces
[in_start
+ (i
/ 2)];
3883 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
3884 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
3887 gimple_seq_add_stmt (seq
, stmt
);
3888 pieces
[out_start
+ i
] = output
;
3890 std::swap (in_start
, out_start
);
3893 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3894 results
.reserve (nresults
);
3895 for (unsigned int i
= 0; i
< nresults
; ++i
)
3897 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
3898 pieces
[in_start
+ i
]));
3900 results
.quick_push (results
[i
- nvectors
]);
3904 /* For constant and loop invariant defs in OP_NODE this function creates
3905 vector defs that will be used in the vectorized stmts and stores them
3906 to SLP_TREE_VEC_DEFS of OP_NODE. */
3909 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
3911 unsigned HOST_WIDE_INT nunits
;
3913 unsigned j
, number_of_places_left_in_vector
;
3916 int group_size
= op_node
->ops
.length ();
3917 unsigned int vec_num
, i
;
3918 unsigned number_of_copies
= 1;
3920 gimple_seq ctor_seq
= NULL
;
3921 auto_vec
<tree
, 16> permute_results
;
3923 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3924 vector_type
= SLP_TREE_VECTYPE (op_node
);
3926 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
3927 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
3928 auto_vec
<tree
> voprnds (number_of_vectors
);
3930 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3931 created vectors. It is greater than 1 if unrolling is performed.
3933 For example, we have two scalar operands, s1 and s2 (e.g., group of
3934 strided accesses of size two), while NUNITS is four (i.e., four scalars
3935 of this type can be packed in a vector). The output vector will contain
3936 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3939 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3940 containing the operands.
3942 For example, NUNITS is four as before, and the group size is 8
3943 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3944 {s5, s6, s7, s8}. */
3946 /* When using duplicate_and_interleave, we just need one element for
3947 each scalar statement. */
3948 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
3949 nunits
= group_size
;
3951 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
3953 number_of_places_left_in_vector
= nunits
;
3955 tree_vector_builder
elts (vector_type
, nunits
, 1);
3956 elts
.quick_grow (nunits
);
3957 stmt_vec_info insert_after
= NULL
;
3958 for (j
= 0; j
< number_of_copies
; j
++)
3961 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
3963 /* Create 'vect_ = {op0,op1,...,opn}'. */
3964 number_of_places_left_in_vector
--;
3966 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
3968 if (CONSTANT_CLASS_P (op
))
3970 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
3972 /* Can't use VIEW_CONVERT_EXPR for booleans because
3973 of possibly different sizes of scalar value and
3975 if (integer_zerop (op
))
3976 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
3977 else if (integer_onep (op
))
3978 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
3983 op
= fold_unary (VIEW_CONVERT_EXPR
,
3984 TREE_TYPE (vector_type
), op
);
3985 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
3989 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
3991 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
3994 = build_all_ones_cst (TREE_TYPE (vector_type
));
3996 = build_zero_cst (TREE_TYPE (vector_type
));
3997 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
3998 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
4004 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
4007 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
4010 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
4014 elts
[number_of_places_left_in_vector
] = op
;
4015 if (!CONSTANT_CLASS_P (op
))
4017 /* For BB vectorization we have to compute an insert location
4018 when a def is inside the analyzed region since we cannot
4019 simply insert at the BB start in this case. */
4020 stmt_vec_info opdef
;
4021 if (TREE_CODE (orig_op
) == SSA_NAME
4022 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
4023 && is_a
<bb_vec_info
> (vinfo
)
4024 && (opdef
= vinfo
->lookup_def (orig_op
)))
4027 insert_after
= opdef
;
4029 insert_after
= get_later_stmt (insert_after
, opdef
);
4032 if (number_of_places_left_in_vector
== 0)
4035 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
4036 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
4037 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
4040 if (permute_results
.is_empty ())
4041 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
4042 elts
, number_of_vectors
,
4044 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
4046 if (!gimple_seq_empty_p (ctor_seq
))
4050 gimple_stmt_iterator gsi
4051 = gsi_for_stmt (insert_after
->stmt
);
4052 gsi_insert_seq_after (&gsi
, ctor_seq
,
4053 GSI_CONTINUE_LINKING
);
4056 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
4059 voprnds
.quick_push (vec_cst
);
4060 insert_after
= NULL
;
4061 number_of_places_left_in_vector
= nunits
;
4063 elts
.new_vector (vector_type
, nunits
, 1);
4064 elts
.quick_grow (nunits
);
4069 /* Since the vectors are created in the reverse order, we should invert
4071 vec_num
= voprnds
.length ();
4072 for (j
= vec_num
; j
!= 0; j
--)
4074 vop
= voprnds
[j
- 1];
4075 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4078 /* In case that VF is greater than the unrolling factor needed for the SLP
4079 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4080 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4081 to replicate the vectors. */
4082 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
4083 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
4085 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
4088 /* Get the Ith vectorized definition from SLP_NODE. */
4091 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
4093 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
4094 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
4096 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
4099 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
4102 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
4104 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
4105 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
4108 gimple
*vec_def_stmt
;
4109 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
4110 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
4113 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
4116 /* Get N vectorized definitions for SLP_NODE. */
4119 vect_get_slp_defs (vec_info
*,
4120 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
4123 n
= SLP_TREE_CHILDREN (slp_node
).length ();
4125 for (unsigned i
= 0; i
< n
; ++i
)
4127 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
4128 vec
<tree
> vec_defs
= vNULL
;
4129 vect_get_slp_defs (child
, &vec_defs
);
4130 vec_oprnds
->quick_push (vec_defs
);
4134 /* Generate vector permute statements from a list of loads in DR_CHAIN.
4135 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
4136 permute statements for the SLP node NODE. */
4139 vect_transform_slp_perm_load (vec_info
*vinfo
,
4140 slp_tree node
, vec
<tree
> dr_chain
,
4141 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
4142 bool analyze_only
, unsigned *n_perms
)
4144 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4146 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4147 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
4148 unsigned int mask_element
;
4151 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
4154 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
4156 mode
= TYPE_MODE (vectype
);
4157 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4159 /* Initialize the vect stmts of NODE to properly insert the generated
4162 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
4163 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
4164 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
4166 /* Generate permutation masks for every NODE. Number of masks for each NODE
4167 is equal to GROUP_SIZE.
4168 E.g., we have a group of three nodes with three loads from the same
4169 location in each node, and the vector size is 4. I.e., we have a
4170 a0b0c0a1b1c1... sequence and we need to create the following vectors:
4171 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
4172 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
4175 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
4176 The last mask is illegal since we assume two operands for permute
4177 operation, and the mask element values can't be outside that range.
4178 Hence, the last mask must be converted into {2,5,5,5}.
4179 For the first two permutations we need the first and the second input
4180 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
4181 we need the second and the third vectors: {b1,c1,a2,b2} and
4184 int vect_stmts_counter
= 0;
4185 unsigned int index
= 0;
4186 int first_vec_index
= -1;
4187 int second_vec_index
= -1;
4191 vec_perm_builder mask
;
4192 unsigned int nelts_to_build
;
4193 unsigned int nvectors_per_build
;
4194 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
4195 && multiple_p (nunits
, group_size
));
4198 /* A single vector contains a whole number of copies of the node, so:
4199 (a) all permutes can use the same mask; and
4200 (b) the permutes only need a single vector input. */
4201 mask
.new_vector (nunits
, group_size
, 3);
4202 nelts_to_build
= mask
.encoded_nelts ();
4203 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
4207 /* We need to construct a separate mask for each vector statement. */
4208 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
4209 if (!nunits
.is_constant (&const_nunits
)
4210 || !vf
.is_constant (&const_vf
))
4212 mask
.new_vector (const_nunits
, const_nunits
, 1);
4213 nelts_to_build
= const_vf
* group_size
;
4214 nvectors_per_build
= 1;
4217 unsigned int count
= mask
.encoded_nelts ();
4218 mask
.quick_grow (count
);
4219 vec_perm_indices indices
;
4221 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
4223 unsigned int iter_num
= j
/ group_size
;
4224 unsigned int stmt_num
= j
% group_size
;
4225 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
4226 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
4229 first_vec_index
= 0;
4234 /* Enforced before the loop when !repeating_p. */
4235 unsigned int const_nunits
= nunits
.to_constant ();
4236 vec_index
= i
/ const_nunits
;
4237 mask_element
= i
% const_nunits
;
4238 if (vec_index
== first_vec_index
4239 || first_vec_index
== -1)
4241 first_vec_index
= vec_index
;
4243 else if (vec_index
== second_vec_index
4244 || second_vec_index
== -1)
4246 second_vec_index
= vec_index
;
4247 mask_element
+= const_nunits
;
4251 if (dump_enabled_p ())
4252 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4253 "permutation requires at "
4254 "least three vectors %G",
4256 gcc_assert (analyze_only
);
4260 gcc_assert (mask_element
< 2 * const_nunits
);
4263 if (mask_element
!= index
)
4265 mask
[index
++] = mask_element
;
4267 if (index
== count
&& !noop_p
)
4269 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
4270 if (!can_vec_perm_const_p (mode
, indices
))
4272 if (dump_enabled_p ())
4274 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
4276 "unsupported vect permute { ");
4277 for (i
= 0; i
< count
; ++i
)
4279 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
4280 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
4282 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
4284 gcc_assert (analyze_only
);
4295 tree mask_vec
= NULL_TREE
;
4298 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
4300 if (second_vec_index
== -1)
4301 second_vec_index
= first_vec_index
;
4303 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
4305 /* Generate the permute statement if necessary. */
4306 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
4307 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
4311 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
4313 = vect_create_destination_var (gimple_assign_lhs (stmt
),
4315 perm_dest
= make_ssa_name (perm_dest
);
4317 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
4318 first_vec
, second_vec
,
4320 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
4324 /* If mask was NULL_TREE generate the requested
4325 identity transform. */
4326 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
4328 /* Store the vector statement in NODE. */
4329 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
4334 first_vec_index
= -1;
4335 second_vec_index
= -1;
4344 /* Vectorize the SLP permutations in NODE as specified
4345 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4346 child number and lane number.
4347 Interleaving of two two-lane two-child SLP subtrees (not supported):
4348 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4349 A blend of two four-lane two-child SLP subtrees:
4350 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4351 Highpart of a four-lane one-child SLP subtree (not supported):
4352 [ { 0, 2 }, { 0, 3 } ]
4353 Where currently only a subset is supported by code generating below. */
4356 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
4357 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
4359 tree vectype
= SLP_TREE_VECTYPE (node
);
4361 /* ??? We currently only support all same vector input and output types
4362 while the SLP IL should really do a concat + select and thus accept
4363 arbitrary mismatches. */
4366 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4367 if (!types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
4369 if (dump_enabled_p ())
4370 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4371 "Unsupported lane permutation\n");
4375 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
4376 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
4377 if (dump_enabled_p ())
4379 dump_printf_loc (MSG_NOTE
, vect_location
,
4380 "vectorizing permutation");
4381 for (unsigned i
= 0; i
< perm
.length (); ++i
)
4382 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
4383 dump_printf (MSG_NOTE
, "\n");
4386 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4387 if (!nunits
.is_constant ())
4389 unsigned HOST_WIDE_INT vf
= 1;
4390 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4391 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
4393 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
4394 gcc_assert (multiple_p (olanes
, nunits
));
4396 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4397 from the { SLP operand, scalar lane } permutation as recorded in the
4398 SLP node as intermediate step. This part should already work
4399 with SLP children with arbitrary number of lanes. */
4400 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
4401 auto_vec
<unsigned> active_lane
;
4402 vperm
.create (olanes
);
4403 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
4404 for (unsigned i
= 0; i
< vf
; ++i
)
4406 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
4408 std::pair
<unsigned, unsigned> p
= perm
[pi
];
4409 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
4410 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
4411 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
4412 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
4413 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
4415 /* Advance to the next group. */
4416 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
4417 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
4420 if (dump_enabled_p ())
4422 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
4423 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
4425 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
4426 dump_printf (MSG_NOTE
, ",");
4427 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
4428 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
4429 vperm
[i
].first
.second
);
4431 dump_printf (MSG_NOTE
, "\n");
4434 /* We can only handle two-vector permutes, everything else should
4435 be lowered on the SLP level. The following is closely inspired
4436 by vect_transform_slp_perm_load and is supposed to eventually
4438 ??? As intermediate step do code-gen in the SLP tree representation
4440 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
4441 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
4442 unsigned int const_nunits
= nunits
.to_constant ();
4443 unsigned int index
= 0;
4444 unsigned int mask_element
;
4445 vec_perm_builder mask
;
4446 mask
.new_vector (const_nunits
, const_nunits
, 1);
4447 unsigned int count
= mask
.encoded_nelts ();
4448 mask
.quick_grow (count
);
4449 vec_perm_indices indices
;
4450 unsigned nperms
= 0;
4451 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
4453 mask_element
= vperm
[i
].second
;
4454 if (first_vec
.first
== -1U
4455 || first_vec
== vperm
[i
].first
)
4456 first_vec
= vperm
[i
].first
;
4457 else if (second_vec
.first
== -1U
4458 || second_vec
== vperm
[i
].first
)
4460 second_vec
= vperm
[i
].first
;
4461 mask_element
+= const_nunits
;
4465 if (dump_enabled_p ())
4466 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4467 "permutation requires at "
4468 "least three vectors");
4473 mask
[index
++] = mask_element
;
4477 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
4479 bool identity_p
= indices
.series_p (0, 1, 0, 1);
4481 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
4483 if (dump_enabled_p ())
4485 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
4487 "unsupported vect permute { ");
4488 for (i
= 0; i
< count
; ++i
)
4490 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
4491 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
4493 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
4503 if (second_vec
.first
== -1U)
4504 second_vec
= first_vec
;
4506 /* Generate the permute statement if necessary. */
4507 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
4509 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
4511 tree perm_dest
= make_ssa_name (vectype
);
4514 slp_tree second_node
4515 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
4517 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
4518 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
4519 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
4520 first_def
, second_def
,
4524 /* We need a copy here in case the def was external. */
4525 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
4526 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
4527 /* Store the vector statement in NODE. */
4528 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
4532 first_vec
= std::make_pair (-1U, -1U);
4533 second_vec
= std::make_pair (-1U, -1U);
4538 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
4543 /* Vectorize SLP instance tree in postorder. */
4546 vect_schedule_slp_instance (vec_info
*vinfo
,
4547 slp_tree node
, slp_instance instance
)
4549 gimple_stmt_iterator si
;
4553 /* See if we have already vectorized the node in the graph of the
4555 if ((SLP_TREE_DEF_TYPE (node
) == vect_internal_def
4556 && SLP_TREE_VEC_STMTS (node
).exists ())
4557 || SLP_TREE_VEC_DEFS (node
).exists ())
4560 /* Vectorize externals and constants. */
4561 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
4562 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
4564 /* ??? vectorizable_shift can end up using a scalar operand which is
4565 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4566 node in this case. */
4567 if (!SLP_TREE_VECTYPE (node
))
4570 vect_create_constant_vectors (vinfo
, node
);
4574 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4575 vect_schedule_slp_instance (vinfo
, child
, instance
);
4577 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
4578 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
4580 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
4581 if (dump_enabled_p ())
4582 dump_printf_loc (MSG_NOTE
, vect_location
,
4583 "------>vectorizing SLP node starting from: %G",
4586 if (STMT_VINFO_DATA_REF (stmt_info
)
4587 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
4589 /* Vectorized loads go before the first scalar load to make it
4590 ready early, vectorized stores go before the last scalar
4591 stmt which is where all uses are ready. */
4592 stmt_vec_info last_stmt_info
= NULL
;
4593 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
4594 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
4595 else /* DR_IS_WRITE */
4596 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
4597 si
= gsi_for_stmt (last_stmt_info
->stmt
);
4599 else if (SLP_TREE_CHILDREN (node
).is_empty ())
4601 /* This happens for reduction and induction PHIs where we do not use the
4602 insertion iterator. */
4603 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
4604 == cycle_phi_info_type
4605 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
4606 == induc_vec_info_type
));
4611 /* Emit other stmts after the children vectorized defs which is
4612 earliest possible. */
4613 gimple
*last_stmt
= NULL
;
4614 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4615 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4617 /* For fold-left reductions we are retaining the scalar
4618 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
4619 set so the representation isn't perfect. Resort to the
4620 last scalar def here. */
4621 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
4623 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
4624 == cycle_phi_info_type
);
4625 gphi
*phi
= as_a
<gphi
*>
4626 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
4628 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
4631 /* We are emitting all vectorized stmts in the same place and
4632 the last one is the last.
4633 ??? Unless we have a load permutation applied and that
4634 figures to re-use an earlier generated load. */
4637 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
4639 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
4642 else if (!SLP_TREE_VECTYPE (child
))
4644 /* For externals we use unvectorized at all scalar defs. */
4647 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
4648 if (TREE_CODE (def
) == SSA_NAME
4649 && !SSA_NAME_IS_DEFAULT_DEF (def
))
4651 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
4653 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
4659 /* For externals we have to look at all defs since their
4660 insertion place is decided per vector. But beware
4661 of pre-existing vectors where we need to make sure
4662 we do not insert before the region boundary. */
4663 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
4664 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
4665 last_stmt
= gsi_stmt (as_a
<bb_vec_info
> (vinfo
)->region_begin
);
4670 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
4671 if (TREE_CODE (vdef
) == SSA_NAME
4672 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
4674 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
4676 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
4681 /* This can happen when all children are pre-existing vectors or
4684 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
4685 if (is_a
<gphi
*> (last_stmt
))
4686 si
= gsi_after_labels (gimple_bb (last_stmt
));
4689 si
= gsi_for_stmt (last_stmt
);
4694 bool done_p
= false;
4696 /* Handle purely internal nodes. */
4697 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4699 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
4700 be shared with different SLP nodes (but usually it's the same
4701 operation apart from the case the stmt is only there for denoting
4702 the actual scalar lane defs ...). So do not call vect_transform_stmt
4703 but open-code it here (partly). */
4704 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
4709 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
4712 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4713 For loop vectorization this is done in vectorizable_call, but for SLP
4714 it needs to be deferred until end of vect_schedule_slp, because multiple
4715 SLP instances may refer to the same scalar stmt. */
4718 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
4719 slp_tree node
, hash_set
<slp_tree
> &visited
)
4722 gimple_stmt_iterator gsi
;
4726 stmt_vec_info stmt_info
;
4728 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
4731 if (visited
.add (node
))
4734 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4735 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
4737 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4739 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
4740 if (!stmt
|| gimple_bb (stmt
) == NULL
)
4742 if (is_pattern_stmt_p (stmt_info
)
4743 || !PURE_SLP_STMT (stmt_info
))
4745 lhs
= gimple_call_lhs (stmt
);
4746 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
4747 gsi
= gsi_for_stmt (stmt
);
4748 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
4749 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
4754 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
4756 hash_set
<slp_tree
> visited
;
4757 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
4760 /* Vectorize the instance root. */
4763 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
4765 gassign
*rstmt
= NULL
;
4767 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
4772 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
4774 tree vect_lhs
= gimple_get_lhs (child_stmt
);
4775 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
4776 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
4777 TREE_TYPE (vect_lhs
)))
4778 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
4780 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
4784 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
4786 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
4789 vec
<constructor_elt
, va_gc
> *v
;
4790 vec_alloc (v
, nelts
);
4792 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
4794 CONSTRUCTOR_APPEND_ELT (v
,
4796 gimple_get_lhs (child_stmt
));
4798 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
4799 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
4800 tree r_constructor
= build_constructor (rtype
, v
);
4801 rstmt
= gimple_build_assign (lhs
, r_constructor
);
4806 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
4807 gsi_replace (&rgsi
, rstmt
, true);
4810 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
4813 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
4815 slp_instance instance
;
4818 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4820 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4821 if (dump_enabled_p ())
4823 dump_printf_loc (MSG_NOTE
, vect_location
,
4824 "Vectorizing SLP tree:\n");
4825 if (SLP_INSTANCE_ROOT_STMT (instance
))
4826 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
4827 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
4828 vect_print_slp_graph (MSG_NOTE
, vect_location
,
4829 SLP_INSTANCE_TREE (instance
));
4831 /* Schedule the tree of INSTANCE. */
4832 vect_schedule_slp_instance (vinfo
, node
, instance
);
4834 if (SLP_INSTANCE_ROOT_STMT (instance
))
4835 vectorize_slp_instance_root_stmt (node
, instance
);
4837 if (dump_enabled_p ())
4838 dump_printf_loc (MSG_NOTE
, vect_location
,
4839 "vectorizing stmts using SLP.\n");
4842 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4844 slp_tree root
= SLP_INSTANCE_TREE (instance
);
4845 stmt_vec_info store_info
;
4848 /* For reductions set the latch values of the vectorized PHIs. */
4849 if (instance
->reduc_phis
4850 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4851 (instance
->reduc_phis
)) != FOLD_LEFT_REDUCTION
4852 && STMT_VINFO_REDUC_TYPE (SLP_TREE_REPRESENTATIVE
4853 (instance
->reduc_phis
)) != EXTRACT_LAST_REDUCTION
)
4855 slp_tree slp_node
= root
;
4856 slp_tree phi_node
= instance
->reduc_phis
;
4857 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_SCALAR_STMTS (phi_node
)[0]->stmt
);
4858 edge e
= loop_latch_edge (gimple_bb (phi
)->loop_father
);
4859 gcc_assert (SLP_TREE_VEC_STMTS (phi_node
).length ()
4860 == SLP_TREE_VEC_STMTS (slp_node
).length ());
4861 for (unsigned j
= 0; j
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++j
)
4862 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[j
]),
4863 vect_get_slp_vect_def (slp_node
, j
),
4864 e
, gimple_phi_arg_location (phi
, e
->dest_idx
));
4867 /* Remove scalar call stmts. Do not do this for basic-block
4868 vectorization as not all uses may be vectorized.
4869 ??? Why should this be necessary? DCE should be able to
4870 remove the stmts itself.
4871 ??? For BB vectorization we can as well remove scalar
4872 stmts starting from the SLP tree root if they have no
4874 if (is_a
<loop_vec_info
> (vinfo
))
4875 vect_remove_slp_scalar_calls (vinfo
, root
);
4877 /* Remove vectorized stores original scalar stmts. */
4878 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
4880 if (!STMT_VINFO_DATA_REF (store_info
)
4881 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
4884 store_info
= vect_orig_stmt (store_info
);
4885 /* Free the attached stmt_vec_info and remove the stmt. */
4886 vinfo
->remove_stmt (store_info
);