1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
50 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
51 slp_tree
, stmt_vector_for_cost
*);
53 /* Initialize a SLP node. */
55 _slp_tree::_slp_tree ()
57 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
58 SLP_TREE_SCALAR_OPS (this) = vNULL
;
59 SLP_TREE_VEC_STMTS (this) = vNULL
;
60 SLP_TREE_VEC_DEFS (this) = vNULL
;
61 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
62 SLP_TREE_CHILDREN (this) = vNULL
;
63 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
64 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
65 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
66 SLP_TREE_CODE (this) = ERROR_MARK
;
67 SLP_TREE_VECTYPE (this) = NULL_TREE
;
68 SLP_TREE_REPRESENTATIVE (this) = NULL
;
74 /* Tear down a SLP node. */
76 _slp_tree::~_slp_tree ()
78 SLP_TREE_CHILDREN (this).release ();
79 SLP_TREE_SCALAR_STMTS (this).release ();
80 SLP_TREE_SCALAR_OPS (this).release ();
81 SLP_TREE_VEC_STMTS (this).release ();
82 SLP_TREE_VEC_DEFS (this).release ();
83 SLP_TREE_LOAD_PERMUTATION (this).release ();
84 SLP_TREE_LANE_PERMUTATION (this).release ();
87 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
88 FINAL_P is true if we have vectorized the instance or if we have
89 made a final decision not to vectorize the statements in any way. */
92 vect_free_slp_tree (slp_tree node
, bool final_p
)
97 if (--node
->refcnt
!= 0)
100 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
101 vect_free_slp_tree (child
, final_p
);
103 /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
104 Some statements might no longer exist, after having been
105 removed by vect_transform_stmt. Updating the remaining
106 statements would be redundant. */
109 stmt_vec_info stmt_info
;
110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
112 gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info
) > 0);
113 STMT_VINFO_NUM_SLP_USES (stmt_info
)--;
120 /* Free the memory allocated for the SLP instance. FINAL_P is true if we
121 have vectorized the instance or if we have made a final decision not
122 to vectorize the statements in any way. */
125 vect_free_slp_instance (slp_instance instance
, bool final_p
)
127 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
), final_p
);
128 SLP_INSTANCE_LOADS (instance
).release ();
133 /* Create an SLP node for SCALAR_STMTS. */
136 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
138 slp_tree node
= new _slp_tree
;
139 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
140 SLP_TREE_CHILDREN (node
).create (nops
);
141 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
142 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
143 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
146 stmt_vec_info stmt_info
;
147 FOR_EACH_VEC_ELT (scalar_stmts
, i
, stmt_info
)
148 STMT_VINFO_NUM_SLP_USES (stmt_info
)++;
153 /* Create an SLP node for OPS. */
156 vect_create_new_slp_node (vec
<tree
> ops
)
158 slp_tree node
= new _slp_tree
;
159 SLP_TREE_SCALAR_OPS (node
) = ops
;
160 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
161 SLP_TREE_LANES (node
) = ops
.length ();
166 /* This structure is used in creation of an SLP tree. Each instance
167 corresponds to the same operand in a group of scalar stmts in an SLP
169 typedef struct _slp_oprnd_info
171 /* Def-stmts for the operands. */
172 vec
<stmt_vec_info
> def_stmts
;
175 /* Information about the first statement, its vector def-type, type, the
176 operand itself in case it's constant, and an indication if it's a pattern
179 enum vect_def_type first_dt
;
184 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
186 static vec
<slp_oprnd_info
>
187 vect_create_oprnd_info (int nops
, int group_size
)
190 slp_oprnd_info oprnd_info
;
191 vec
<slp_oprnd_info
> oprnds_info
;
193 oprnds_info
.create (nops
);
194 for (i
= 0; i
< nops
; i
++)
196 oprnd_info
= XNEW (struct _slp_oprnd_info
);
197 oprnd_info
->def_stmts
.create (group_size
);
198 oprnd_info
->ops
.create (group_size
);
199 oprnd_info
->first_dt
= vect_uninitialized_def
;
200 oprnd_info
->first_op_type
= NULL_TREE
;
201 oprnd_info
->any_pattern
= false;
202 oprnds_info
.quick_push (oprnd_info
);
209 /* Free operands info. */
212 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
215 slp_oprnd_info oprnd_info
;
217 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
219 oprnd_info
->def_stmts
.release ();
220 oprnd_info
->ops
.release ();
221 XDELETE (oprnd_info
);
224 oprnds_info
.release ();
228 /* Return true if STMTS contains a pattern statement. */
231 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
233 stmt_vec_info stmt_info
;
235 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
236 if (is_pattern_stmt_p (stmt_info
))
241 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
242 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
246 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
247 stmt_vec_info first_stmt_info
)
249 stmt_vec_info next_stmt_info
= first_stmt_info
;
252 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
257 if (next_stmt_info
== stmt_info
)
259 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
261 result
+= DR_GROUP_GAP (next_stmt_info
);
263 while (next_stmt_info
);
268 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
269 using the method implemented by duplicate_and_interleave. Return true
270 if so, returning the number of intermediate vectors in *NVECTORS_OUT
271 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
275 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
276 tree elt_type
, unsigned int *nvectors_out
,
277 tree
*vector_type_out
,
280 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
281 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
284 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
285 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
286 unsigned int nvectors
= 1;
289 scalar_int_mode int_mode
;
290 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
291 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
293 /* Get the natural vector type for this SLP group size. */
294 tree int_type
= build_nonstandard_integer_type
295 (GET_MODE_BITSIZE (int_mode
), 1);
297 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
299 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
300 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
301 GET_MODE_SIZE (base_vector_mode
)))
303 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
304 together into elements of type INT_TYPE and using the result
305 to build NVECTORS vectors. */
306 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
307 vec_perm_builder
sel1 (nelts
, 2, 3);
308 vec_perm_builder
sel2 (nelts
, 2, 3);
309 poly_int64 half_nelts
= exact_div (nelts
, 2);
310 for (unsigned int i
= 0; i
< 3; ++i
)
313 sel1
.quick_push (i
+ nelts
);
314 sel2
.quick_push (half_nelts
+ i
);
315 sel2
.quick_push (half_nelts
+ i
+ nelts
);
317 vec_perm_indices
indices1 (sel1
, 2, nelts
);
318 vec_perm_indices
indices2 (sel2
, 2, nelts
);
319 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
320 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
323 *nvectors_out
= nvectors
;
325 *vector_type_out
= vector_type
;
328 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
330 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
337 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
343 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
344 they are of a valid type and that they match the defs of the first stmt of
345 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
346 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
347 indicates swap is required for cond_expr stmts. Specifically, *SWAP
348 is 1 if STMT is cond and operands of comparison need to be swapped;
349 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
350 If there is any operand swap in this function, *SWAP is set to non-zero
352 If there was a fatal error return -1; if the error could be corrected by
353 swapping operands of father node of this one, return 1; if everything is
356 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char *swap
,
357 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
358 vec
<slp_oprnd_info
> *oprnds_info
)
360 stmt_vec_info stmt_info
= stmts
[stmt_num
];
362 unsigned int i
, number_of_oprnds
;
363 enum vect_def_type dt
= vect_uninitialized_def
;
364 slp_oprnd_info oprnd_info
;
365 int first_op_idx
= 1;
366 unsigned int commutative_op
= -1U;
367 bool first_op_cond
= false;
368 bool first
= stmt_num
== 0;
370 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
372 number_of_oprnds
= gimple_call_num_args (stmt
);
374 if (gimple_call_internal_p (stmt
))
376 internal_fn ifn
= gimple_call_internal_fn (stmt
);
377 commutative_op
= first_commutative_argument (ifn
);
379 /* Masked load, only look at mask. */
380 if (ifn
== IFN_MASK_LOAD
)
382 number_of_oprnds
= 1;
383 /* Mask operand index. */
388 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
390 enum tree_code code
= gimple_assign_rhs_code (stmt
);
391 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
392 /* Swap can only be done for cond_expr if asked to, otherwise we
393 could result in different comparison code to the first stmt. */
394 if (code
== COND_EXPR
395 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
397 first_op_cond
= true;
401 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
406 bool swapped
= (*swap
!= 0);
407 gcc_assert (!swapped
|| first_op_cond
);
408 for (i
= 0; i
< number_of_oprnds
; i
++)
413 /* Map indicating how operands of cond_expr should be swapped. */
414 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
415 int *map
= maps
[*swap
];
418 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
419 first_op_idx
), map
[i
]);
421 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
424 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
425 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
426 oprnd
= TREE_OPERAND (oprnd
, 0);
428 oprnd_info
= (*oprnds_info
)[i
];
430 stmt_vec_info def_stmt_info
;
431 if (!vect_is_simple_use (oprnd
, vinfo
, &dt
, &def_stmt_info
))
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
435 "Build SLP failed: can't analyze def for %T\n",
441 if (def_stmt_info
&& is_pattern_stmt_p (def_stmt_info
))
442 oprnd_info
->any_pattern
= true;
446 /* For the swapping logic below force vect_reduction_def
447 for the reduction op in a SLP reduction group. */
448 if (!STMT_VINFO_DATA_REF (stmt_info
)
449 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
450 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
452 dt
= vect_reduction_def
;
453 oprnd_info
->first_dt
= dt
;
454 oprnd_info
->first_op_type
= TREE_TYPE (oprnd
);
458 /* Not first stmt of the group, check that the def-stmt/s match
459 the def-stmt/s of the first stmt. Allow different definition
460 types for reduction chains: the first stmt must be a
461 vect_reduction_def (a phi node), and the rest
462 end in the reduction chain. */
463 tree type
= TREE_TYPE (oprnd
);
464 if ((oprnd_info
->first_dt
!= dt
465 && !(oprnd_info
->first_dt
== vect_reduction_def
466 && !STMT_VINFO_DATA_REF (stmt_info
)
467 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
469 && !STMT_VINFO_DATA_REF (def_stmt_info
)
470 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
471 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
472 && !((oprnd_info
->first_dt
== vect_external_def
473 || oprnd_info
->first_dt
== vect_constant_def
)
474 && (dt
== vect_external_def
475 || dt
== vect_constant_def
)))
476 || !types_compatible_p (oprnd_info
->first_op_type
, type
)
477 || (!STMT_VINFO_DATA_REF (stmt_info
)
478 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
480 || STMT_VINFO_DATA_REF (def_stmt_info
)
481 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
482 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
483 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
485 /* Try swapping operands if we got a mismatch. */
486 if (i
== commutative_op
&& !swapped
)
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE
, vect_location
,
490 "trying swapped operands\n");
495 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
497 "Build SLP failed: different types\n");
501 if ((dt
== vect_constant_def
502 || dt
== vect_external_def
)
503 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
504 && (TREE_CODE (type
) == BOOLEAN_TYPE
505 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
508 if (dump_enabled_p ())
509 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
510 "Build SLP failed: invalid type of def "
511 "for variable-length SLP %T\n", oprnd
);
516 /* Check the types of the definitions. */
519 case vect_external_def
:
520 /* Make sure to demote the overall operand to external. */
521 oprnd_info
->first_dt
= vect_external_def
;
523 case vect_constant_def
:
524 oprnd_info
->def_stmts
.quick_push (NULL
);
525 oprnd_info
->ops
.quick_push (oprnd
);
528 case vect_internal_def
:
529 case vect_reduction_def
:
530 if (oprnd_info
->first_dt
== vect_reduction_def
531 && !STMT_VINFO_DATA_REF (stmt_info
)
532 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
533 && !STMT_VINFO_DATA_REF (def_stmt_info
)
534 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
535 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
537 /* For a SLP reduction chain we want to duplicate the
538 reduction to each of the chain members. That gets
539 us a sane SLP graph (still the stmts are not 100%
540 correct wrt the initial values). */
542 oprnd_info
->def_stmts
.quick_push (oprnd_info
->def_stmts
[0]);
543 oprnd_info
->ops
.quick_push (oprnd_info
->ops
[0]);
547 case vect_induction_def
:
548 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
549 oprnd_info
->ops
.quick_push (oprnd
);
553 /* FORNOW: Not supported. */
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
556 "Build SLP failed: illegal type of def %T\n",
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE
, vect_location
,
568 "swapped operands to match def types in %G",
576 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
577 Return true if we can, meaning that this choice doesn't conflict with
578 existing SLP nodes that use STMT_INFO. */
581 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
583 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
584 if (old_vectype
&& useless_type_conversion_p (vectype
, old_vectype
))
587 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
588 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
590 /* We maintain the invariant that if any statement in the group is
591 used, all other members of the group have the same vector type. */
592 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
593 stmt_vec_info member_info
= first_info
;
594 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
595 if (STMT_VINFO_NUM_SLP_USES (member_info
) > 0
596 || is_pattern_stmt_p (member_info
))
601 for (member_info
= first_info
; member_info
;
602 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
603 STMT_VINFO_VECTYPE (member_info
) = vectype
;
607 else if (STMT_VINFO_NUM_SLP_USES (stmt_info
) == 0
608 && !is_pattern_stmt_p (stmt_info
))
610 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
614 if (dump_enabled_p ())
616 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
617 "Build SLP failed: incompatible vector"
618 " types for: %G", stmt_info
->stmt
);
619 dump_printf_loc (MSG_NOTE
, vect_location
,
620 " old vector type: %T\n", old_vectype
);
621 dump_printf_loc (MSG_NOTE
, vect_location
,
622 " new vector type: %T\n", vectype
);
627 /* Return true if call statements CALL1 and CALL2 are similar enough
628 to be combined into the same SLP group. */
631 compatible_calls_p (gcall
*call1
, gcall
*call2
)
633 unsigned int nargs
= gimple_call_num_args (call1
);
634 if (nargs
!= gimple_call_num_args (call2
))
637 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
640 if (gimple_call_internal_p (call1
))
642 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
643 TREE_TYPE (gimple_call_lhs (call2
))))
645 for (unsigned int i
= 0; i
< nargs
; ++i
)
646 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
647 TREE_TYPE (gimple_call_arg (call2
, i
))))
652 if (!operand_equal_p (gimple_call_fn (call1
),
653 gimple_call_fn (call2
), 0))
656 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
662 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
663 caller's attempt to find the vector type in STMT_INFO with the narrowest
664 element type. Return true if VECTYPE is nonnull and if it is valid
665 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
666 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
667 vect_build_slp_tree. */
670 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
671 unsigned int group_size
,
672 tree vectype
, poly_uint64
*max_nunits
)
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
678 "Build SLP failed: unsupported data-type in %G\n",
680 /* Fatal mismatch. */
684 /* If populating the vector type requires unrolling then fail
685 before adjusting *max_nunits for basic-block vectorization. */
686 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
687 unsigned HOST_WIDE_INT const_nunits
;
688 if (is_a
<bb_vec_info
> (vinfo
)
689 && (!nunits
.is_constant (&const_nunits
)
690 || const_nunits
> group_size
))
692 if (dump_enabled_p ())
693 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
694 "Build SLP failed: unrolling required "
695 "in basic block SLP\n");
696 /* Fatal mismatch. */
700 /* In case of multiple types we need to detect the smallest type. */
701 vect_update_max_nunits (max_nunits
, vectype
);
705 /* Verify if the scalar stmts STMTS are isomorphic, require data
706 permutation or are of unsupported types of operation. Return
707 true if they are, otherwise return false and indicate in *MATCHES
708 which stmts are not isomorphic to the first one. If MATCHES[0]
709 is false then this indicates the comparison could not be
710 carried out or the stmts will never be vectorized by SLP.
712 Note COND_EXPR is possibly isomorphic to another one after swapping its
713 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
714 the first stmt by swapping the two operands of comparison; set SWAP[i]
715 to 2 if stmt I is isormorphic to the first stmt by inverting the code
716 of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
717 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
720 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
721 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
722 poly_uint64
*max_nunits
, bool *matches
,
723 bool *two_operators
, tree
*node_vectype
)
726 stmt_vec_info first_stmt_info
= stmts
[0];
727 enum tree_code first_stmt_code
= ERROR_MARK
;
728 enum tree_code alt_stmt_code
= ERROR_MARK
;
729 enum tree_code rhs_code
= ERROR_MARK
;
730 enum tree_code first_cond_code
= ERROR_MARK
;
732 bool need_same_oprnds
= false;
733 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
736 machine_mode optab_op2_mode
;
737 machine_mode vec_mode
;
738 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
741 /* For every stmt in NODE find its def stmt/s. */
742 stmt_vec_info stmt_info
;
743 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
745 gimple
*stmt
= stmt_info
->stmt
;
749 if (dump_enabled_p ())
750 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
752 /* Fail to vectorize statements marked as unvectorizable. */
753 if (!STMT_VINFO_VECTORIZABLE (stmt_info
))
755 if (dump_enabled_p ())
756 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
757 "Build SLP failed: unvectorizable statement %G",
759 /* Fatal mismatch. */
764 lhs
= gimple_get_lhs (stmt
);
765 if (lhs
== NULL_TREE
)
767 if (dump_enabled_p ())
768 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
769 "Build SLP failed: not GIMPLE_ASSIGN nor "
770 "GIMPLE_CALL %G", stmt
);
771 /* Fatal mismatch. */
777 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
778 &nunits_vectype
, group_size
)
780 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
781 nunits_vectype
, max_nunits
)))
783 /* Fatal mismatch. */
788 gcc_assert (vectype
);
790 if (is_a
<bb_vec_info
> (vinfo
)
791 && !vect_update_shared_vectype (stmt_info
, vectype
))
794 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
797 rhs_code
= CALL_EXPR
;
799 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
801 else if ((gimple_call_internal_p (call_stmt
)
802 && (!vectorizable_internal_fn_p
803 (gimple_call_internal_fn (call_stmt
))))
804 || gimple_call_tail_p (call_stmt
)
805 || gimple_call_noreturn_p (call_stmt
)
806 || !gimple_call_nothrow_p (call_stmt
)
807 || gimple_call_chain (call_stmt
))
809 if (dump_enabled_p ())
810 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
811 "Build SLP failed: unsupported call type %G",
813 /* Fatal mismatch. */
820 rhs_code
= gimple_assign_rhs_code (stmt
);
821 load_p
= TREE_CODE_CLASS (rhs_code
) == tcc_reference
;
824 /* Check the operation. */
827 *node_vectype
= vectype
;
828 first_stmt_code
= rhs_code
;
830 /* Shift arguments should be equal in all the packed stmts for a
831 vector shift with scalar shift operand. */
832 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
833 || rhs_code
== LROTATE_EXPR
834 || rhs_code
== RROTATE_EXPR
)
836 vec_mode
= TYPE_MODE (vectype
);
838 /* First see if we have a vector/vector shift. */
839 optab
= optab_for_tree_code (rhs_code
, vectype
,
843 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
845 /* No vector/vector shift, try for a vector/scalar shift. */
846 optab
= optab_for_tree_code (rhs_code
, vectype
,
851 if (dump_enabled_p ())
852 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
853 "Build SLP failed: no optab.\n");
854 /* Fatal mismatch. */
858 icode
= (int) optab_handler (optab
, vec_mode
);
859 if (icode
== CODE_FOR_nothing
)
861 if (dump_enabled_p ())
862 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
864 "op not supported by target.\n");
865 /* Fatal mismatch. */
869 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
870 if (!VECTOR_MODE_P (optab_op2_mode
))
872 need_same_oprnds
= true;
873 first_op1
= gimple_assign_rhs2 (stmt
);
877 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
879 need_same_oprnds
= true;
880 first_op1
= gimple_assign_rhs2 (stmt
);
883 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
885 need_same_oprnds
= true;
886 first_op1
= gimple_call_arg (call_stmt
, 1);
891 if (first_stmt_code
!= rhs_code
892 && alt_stmt_code
== ERROR_MARK
)
893 alt_stmt_code
= rhs_code
;
894 if (first_stmt_code
!= rhs_code
895 && (first_stmt_code
!= IMAGPART_EXPR
896 || rhs_code
!= REALPART_EXPR
)
897 && (first_stmt_code
!= REALPART_EXPR
898 || rhs_code
!= IMAGPART_EXPR
)
899 /* Handle mismatches in plus/minus by computing both
900 and merging the results. */
901 && !((first_stmt_code
== PLUS_EXPR
902 || first_stmt_code
== MINUS_EXPR
)
903 && (alt_stmt_code
== PLUS_EXPR
904 || alt_stmt_code
== MINUS_EXPR
)
905 && rhs_code
== alt_stmt_code
)
906 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
907 && (first_stmt_code
== ARRAY_REF
908 || first_stmt_code
== BIT_FIELD_REF
909 || first_stmt_code
== INDIRECT_REF
910 || first_stmt_code
== COMPONENT_REF
911 || first_stmt_code
== MEM_REF
)))
913 if (dump_enabled_p ())
915 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
916 "Build SLP failed: different operation "
918 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
919 "original stmt %G", first_stmt_info
->stmt
);
925 if (need_same_oprnds
)
927 tree other_op1
= (call_stmt
928 ? gimple_call_arg (call_stmt
, 1)
929 : gimple_assign_rhs2 (stmt
));
930 if (!operand_equal_p (first_op1
, other_op1
, 0))
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
934 "Build SLP failed: different shift "
935 "arguments in %G", stmt
);
941 if (!load_p
&& rhs_code
== CALL_EXPR
)
943 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
944 as_a
<gcall
*> (stmt
)))
946 if (dump_enabled_p ())
947 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
948 "Build SLP failed: different calls in %G",
956 /* Grouped store or load. */
957 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
959 if (REFERENCE_CLASS_P (lhs
))
967 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
970 /* Check that there are no loads from different interleaving
971 chains in the same node. */
972 if (prev_first_load
!= first_load
)
974 if (dump_enabled_p ())
975 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
977 "Build SLP failed: different "
978 "interleaving chains in one node %G",
985 prev_first_load
= first_load
;
987 } /* Grouped access. */
992 /* Not grouped load. */
993 if (dump_enabled_p ())
994 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
995 "Build SLP failed: not grouped load %G", stmt
);
997 /* FORNOW: Not grouped loads are not supported. */
998 /* Fatal mismatch. */
1003 /* Not memory operation. */
1004 if (TREE_CODE_CLASS (rhs_code
) != tcc_binary
1005 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1006 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1007 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1008 && rhs_code
!= VIEW_CONVERT_EXPR
1009 && rhs_code
!= CALL_EXPR
)
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1013 "Build SLP failed: operation unsupported %G",
1015 /* Fatal mismatch. */
1020 if (rhs_code
== COND_EXPR
)
1022 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1023 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1024 enum tree_code swap_code
= ERROR_MARK
;
1025 enum tree_code invert_code
= ERROR_MARK
;
1028 first_cond_code
= TREE_CODE (cond_expr
);
1029 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1031 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1032 swap_code
= swap_tree_comparison (cond_code
);
1033 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1036 if (first_cond_code
== cond_code
)
1038 /* Isomorphic can be achieved by swapping. */
1039 else if (first_cond_code
== swap_code
)
1041 /* Isomorphic can be achieved by inverting. */
1042 else if (first_cond_code
== invert_code
)
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1048 "Build SLP failed: different"
1049 " operation %G", stmt
);
1059 for (i
= 0; i
< group_size
; ++i
)
1063 /* If we allowed a two-operation SLP node verify the target can cope
1064 with the permute we are going to use. */
1065 if (alt_stmt_code
!= ERROR_MARK
1066 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1068 *two_operators
= true;
1074 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1075 Note we never remove apart from at destruction time so we do not
1076 need a special value for deleted that differs from empty. */
1079 typedef vec
<stmt_vec_info
> value_type
;
1080 typedef vec
<stmt_vec_info
> compare_type
;
1081 static inline hashval_t
hash (value_type
);
1082 static inline bool equal (value_type existing
, value_type candidate
);
1083 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1084 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1085 static const bool empty_zero_p
= true;
1086 static inline void mark_empty (value_type
&x
) { x
.release (); }
1087 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1088 static inline void remove (value_type
&x
) { x
.release (); }
1091 bst_traits::hash (value_type x
)
1094 for (unsigned i
= 0; i
< x
.length (); ++i
)
1095 h
.add_int (gimple_uid (x
[i
]->stmt
));
1099 bst_traits::equal (value_type existing
, value_type candidate
)
1101 if (existing
.length () != candidate
.length ())
1103 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1104 if (existing
[i
] != candidate
[i
])
1109 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1110 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1111 scalar_stmts_to_slp_tree_map_t
;
1114 vect_build_slp_tree_2 (vec_info
*vinfo
,
1115 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1116 poly_uint64
*max_nunits
,
1117 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1118 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1121 vect_build_slp_tree (vec_info
*vinfo
,
1122 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1123 poly_uint64
*max_nunits
,
1124 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1125 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1127 if (slp_tree
*leader
= bst_map
->get (stmts
))
1129 if (dump_enabled_p ())
1130 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1131 *leader
? "" : "failed ", *leader
);
1134 (*leader
)->refcnt
++;
1135 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1139 poly_uint64 this_max_nunits
= 1;
1140 slp_tree res
= vect_build_slp_tree_2 (vinfo
, stmts
, group_size
,
1142 matches
, npermutes
, tree_size
, bst_map
);
1145 res
->max_nunits
= this_max_nunits
;
1146 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1147 /* Keep a reference for the bst_map use. */
1150 bst_map
->put (stmts
.copy (), res
);
1154 /* Recursively build an SLP tree starting from NODE.
1155 Fail (and return a value not equal to zero) if def-stmts are not
1156 isomorphic, require data permutation or are of unsupported types of
1157 operation. Otherwise, return 0.
1158 The value returned is the depth in the SLP tree where a mismatch
1162 vect_build_slp_tree_2 (vec_info
*vinfo
,
1163 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1164 poly_uint64
*max_nunits
,
1165 bool *matches
, unsigned *npermutes
, unsigned *tree_size
,
1166 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1168 unsigned nops
, i
, this_tree_size
= 0;
1169 poly_uint64 this_max_nunits
= *max_nunits
;
1174 stmt_vec_info stmt_info
= stmts
[0];
1175 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1176 nops
= gimple_call_num_args (stmt
);
1177 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1179 nops
= gimple_num_ops (stmt
) - 1;
1180 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1183 else if (is_a
<gphi
*> (stmt_info
->stmt
))
1188 /* If the SLP node is a PHI (induction or reduction), terminate
1190 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1192 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1193 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
1194 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1198 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1199 /* Induction from different IVs is not supported. */
1200 if (def_type
== vect_induction_def
)
1202 stmt_vec_info other_info
;
1203 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1204 if (stmt_info
!= other_info
)
1207 else if (def_type
== vect_reduction_def
1208 || def_type
== vect_double_reduction_def
1209 || def_type
== vect_nested_cycle
)
1211 /* Else def types have to match. */
1212 stmt_vec_info other_info
;
1213 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1214 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1220 node
= vect_create_new_slp_node (stmts
, 0);
1221 SLP_TREE_VECTYPE (node
) = vectype
;
1226 bool two_operators
= false;
1227 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1228 tree vectype
= NULL_TREE
;
1229 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1230 &this_max_nunits
, matches
, &two_operators
,
1234 /* If the SLP node is a load, terminate the recursion unless masked. */
1235 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1236 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1238 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1241 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1246 *max_nunits
= this_max_nunits
;
1248 node
= vect_create_new_slp_node (stmts
, 0);
1249 SLP_TREE_VECTYPE (node
) = vectype
;
1250 /* And compute the load permutation. Whether it is actually
1251 a permutation depends on the unrolling factor which is
1253 vec
<unsigned> load_permutation
;
1255 stmt_vec_info load_info
;
1256 load_permutation
.create (group_size
);
1257 stmt_vec_info first_stmt_info
1258 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1259 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1261 int load_place
= vect_get_place_in_interleaving_chain
1262 (load_info
, first_stmt_info
);
1263 gcc_assert (load_place
!= -1);
1264 load_permutation
.safe_push (load_place
);
1266 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1271 /* Get at the operands, verifying they are compatible. */
1272 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1273 slp_oprnd_info oprnd_info
;
1274 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1276 int res
= vect_get_and_check_slp_defs (vinfo
, &swap
[i
],
1277 stmts
, i
, &oprnds_info
);
1279 matches
[(res
== -1) ? 0 : i
] = false;
1283 for (i
= 0; i
< group_size
; ++i
)
1286 vect_free_oprnd_info (oprnds_info
);
1290 auto_vec
<slp_tree
, 4> children
;
1292 stmt_info
= stmts
[0];
1294 /* Create SLP_TREE nodes for the definition node/s. */
1295 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1300 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1302 /* COND_EXPR have one too many eventually if the condition
1304 gcc_assert (i
== 3 && nops
== 4);
1308 if (oprnd_info
->first_dt
!= vect_internal_def
1309 && oprnd_info
->first_dt
!= vect_reduction_def
1310 && oprnd_info
->first_dt
!= vect_induction_def
)
1312 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1313 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1314 oprnd_info
->ops
= vNULL
;
1315 children
.safe_push (invnode
);
1319 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1320 group_size
, &this_max_nunits
,
1322 &this_tree_size
, bst_map
)) != NULL
)
1324 oprnd_info
->def_stmts
= vNULL
;
1325 children
.safe_push (child
);
1329 /* If the SLP build failed fatally and we analyze a basic-block
1330 simply treat nodes we fail to build as externally defined
1331 (and thus build vectors from the scalar defs).
1332 The cost model will reject outright expensive cases.
1333 ??? This doesn't treat cases where permutation ultimatively
1334 fails (or we don't try permutation below). Ideally we'd
1335 even compute a permutation that will end up with the maximum
1337 if (is_a
<bb_vec_info
> (vinfo
)
1339 /* ??? Rejecting patterns this way doesn't work. We'd have to
1340 do extra work to cancel the pattern so the uses see the
1342 && !is_pattern_stmt_p (stmt_info
)
1343 && !oprnd_info
->any_pattern
)
1345 if (dump_enabled_p ())
1346 dump_printf_loc (MSG_NOTE
, vect_location
,
1347 "Building vector operands from scalars\n");
1349 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1350 children
.safe_push (child
);
1351 oprnd_info
->ops
= vNULL
;
1352 oprnd_info
->def_stmts
= vNULL
;
1356 /* If the SLP build for operand zero failed and operand zero
1357 and one can be commutated try that for the scalar stmts
1358 that failed the match. */
1360 /* A first scalar stmt mismatch signals a fatal mismatch. */
1362 /* ??? For COND_EXPRs we can swap the comparison operands
1363 as well as the arms under some constraints. */
1365 && oprnds_info
[1]->first_dt
== vect_internal_def
1366 && is_gimple_assign (stmt_info
->stmt
)
1367 /* Swapping operands for reductions breaks assumptions later on. */
1368 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1369 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
1370 /* Do so only if the number of not successful permutes was nor more
1371 than a cut-ff as re-trying the recursive match on
1372 possibly each level of the tree would expose exponential
1376 /* See whether we can swap the matching or the non-matching
1378 bool swap_not_matching
= true;
1381 for (j
= 0; j
< group_size
; ++j
)
1383 if (matches
[j
] != !swap_not_matching
)
1385 stmt_vec_info stmt_info
= stmts
[j
];
1386 /* Verify if we can swap operands of this stmt. */
1387 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1389 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1391 if (!swap_not_matching
)
1393 swap_not_matching
= false;
1398 while (j
!= group_size
);
1400 /* Swap mismatched definition stmts. */
1401 if (dump_enabled_p ())
1402 dump_printf_loc (MSG_NOTE
, vect_location
,
1403 "Re-trying with swapped operands of stmts ");
1404 for (j
= 0; j
< group_size
; ++j
)
1405 if (matches
[j
] == !swap_not_matching
)
1407 std::swap (oprnds_info
[0]->def_stmts
[j
],
1408 oprnds_info
[1]->def_stmts
[j
]);
1409 std::swap (oprnds_info
[0]->ops
[j
],
1410 oprnds_info
[1]->ops
[j
]);
1411 if (dump_enabled_p ())
1412 dump_printf (MSG_NOTE
, "%d ", j
);
1414 if (dump_enabled_p ())
1415 dump_printf (MSG_NOTE
, "\n");
1416 /* And try again with scratch 'matches' ... */
1417 bool *tem
= XALLOCAVEC (bool, group_size
);
1418 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1419 group_size
, &this_max_nunits
,
1421 &this_tree_size
, bst_map
)) != NULL
)
1423 oprnd_info
->def_stmts
= vNULL
;
1424 children
.safe_push (child
);
1432 gcc_assert (child
== NULL
);
1433 FOR_EACH_VEC_ELT (children
, j
, child
)
1434 vect_free_slp_tree (child
, false);
1435 vect_free_oprnd_info (oprnds_info
);
1439 vect_free_oprnd_info (oprnds_info
);
1441 /* If we have all children of a non-unary child built up from
1442 scalars then just throw that away, causing it built up
1445 && is_a
<bb_vec_info
> (vinfo
)
1446 /* ??? Rejecting patterns this way doesn't work. We'd have to
1447 do extra work to cancel the pattern so the uses see the
1449 && !is_pattern_stmt_p (stmt_info
))
1453 FOR_EACH_VEC_ELT (children
, j
, child
)
1454 if (SLP_TREE_DEF_TYPE (child
) != vect_external_def
)
1459 FOR_EACH_VEC_ELT (children
, j
, child
)
1460 vect_free_slp_tree (child
, false);
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_NOTE
, vect_location
,
1464 "Building parent vector operands from "
1465 "scalars instead\n");
1470 *tree_size
+= this_tree_size
+ 1;
1471 *max_nunits
= this_max_nunits
;
1475 /* ??? We'd likely want to either cache in bst_map sth like
1476 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1477 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1478 explicit stmts to put in so the keying on 'stmts' doesn't
1479 work (but we have the same issue with nodes that use 'ops'). */
1480 slp_tree one
= new _slp_tree
;
1481 slp_tree two
= new _slp_tree
;
1482 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1483 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1484 SLP_TREE_VECTYPE (one
) = vectype
;
1485 SLP_TREE_VECTYPE (two
) = vectype
;
1486 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1487 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1489 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1492 /* Here we record the original defs since this
1493 node represents the final lane configuration. */
1494 node
= vect_create_new_slp_node (stmts
, 2);
1495 SLP_TREE_VECTYPE (node
) = vectype
;
1496 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1497 SLP_TREE_CHILDREN (node
).quick_push (one
);
1498 SLP_TREE_CHILDREN (node
).quick_push (two
);
1499 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1500 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1501 enum tree_code ocode
= ERROR_MARK
;
1502 stmt_vec_info ostmt_info
;
1504 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1506 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1507 if (gimple_assign_rhs_code (ostmt
) != code0
)
1509 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1510 ocode
= gimple_assign_rhs_code (ostmt
);
1514 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1516 SLP_TREE_CODE (one
) = code0
;
1517 SLP_TREE_CODE (two
) = ocode
;
1518 SLP_TREE_LANES (one
) = stmts
.length ();
1519 SLP_TREE_LANES (two
) = stmts
.length ();
1520 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1521 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1525 node
= vect_create_new_slp_node (stmts
, nops
);
1526 SLP_TREE_VECTYPE (node
) = vectype
;
1527 SLP_TREE_CHILDREN (node
).splice (children
);
1531 /* Dump a single SLP tree NODE. */
1534 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1539 stmt_vec_info stmt_info
;
1542 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1543 dump_user_location_t user_loc
= loc
.get_user_location ();
1544 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1545 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1547 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1550 estimated_poly_value (node
->max_nunits
), node
->refcnt
);
1551 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1552 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1553 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1556 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1557 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1558 dump_printf (metadata
, "%T%s ", op
,
1559 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1560 dump_printf (metadata
, "}\n");
1562 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1564 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1565 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1566 dump_printf (dump_kind
, " %u", j
);
1567 dump_printf (dump_kind
, " }\n");
1569 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1571 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1572 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1573 dump_printf (dump_kind
, " %u[%u]",
1574 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1575 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
1576 dump_printf (dump_kind
, " }\n");
1578 if (SLP_TREE_CHILDREN (node
).is_empty ())
1580 dump_printf_loc (metadata
, user_loc
, "\tchildren");
1581 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1582 dump_printf (dump_kind
, " %p", (void *)child
);
1583 dump_printf (dump_kind
, "\n");
1587 debug (slp_tree node
)
1589 debug_dump_context ctx
;
1590 vect_print_slp_tree (MSG_NOTE
,
1591 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
1595 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1598 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1599 slp_tree node
, hash_set
<slp_tree
> &visited
)
1604 if (visited
.add (node
))
1607 vect_print_slp_tree (dump_kind
, loc
, node
);
1609 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1610 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
1614 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
1617 hash_set
<slp_tree
> visited
;
1618 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
1621 /* Mark the tree rooted at NODE with PURE_SLP. */
1624 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
1627 stmt_vec_info stmt_info
;
1630 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1633 if (visited
.add (node
))
1636 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1637 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
1639 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1640 vect_mark_slp_stmts (child
, visited
);
1644 vect_mark_slp_stmts (slp_tree node
)
1646 hash_set
<slp_tree
> visited
;
1647 vect_mark_slp_stmts (node
, visited
);
1650 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1653 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
1656 stmt_vec_info stmt_info
;
1659 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1662 if (visited
.add (node
))
1665 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1667 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
1668 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
1669 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
1672 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1673 vect_mark_slp_stmts_relevant (child
, visited
);
1677 vect_mark_slp_stmts_relevant (slp_tree node
)
1679 hash_set
<slp_tree
> visited
;
1680 vect_mark_slp_stmts_relevant (node
, visited
);
1683 /* Copy the SLP subtree rooted at NODE. */
1686 slp_copy_subtree (slp_tree node
, hash_map
<slp_tree
, slp_tree
> &map
)
1691 slp_tree
©_ref
= map
.get_or_insert (node
, &existed_p
);
1695 copy_ref
= new _slp_tree
;
1696 slp_tree copy
= copy_ref
;
1697 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
1698 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
1699 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
1700 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
1701 copy
->max_nunits
= node
->max_nunits
;
1703 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1705 SLP_TREE_SCALAR_STMTS (copy
) = SLP_TREE_SCALAR_STMTS (node
).copy ();
1706 stmt_vec_info stmt_info
;
1707 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1708 STMT_VINFO_NUM_SLP_USES (stmt_info
)++;
1710 if (SLP_TREE_SCALAR_OPS (node
).exists ())
1711 SLP_TREE_SCALAR_OPS (copy
) = SLP_TREE_SCALAR_OPS (node
).copy ();
1712 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1713 SLP_TREE_LOAD_PERMUTATION (copy
) = SLP_TREE_LOAD_PERMUTATION (node
).copy ();
1714 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1715 SLP_TREE_LANE_PERMUTATION (copy
) = SLP_TREE_LANE_PERMUTATION (node
).copy ();
1716 if (SLP_TREE_CHILDREN (node
).exists ())
1717 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
).copy ();
1718 gcc_assert (!SLP_TREE_VEC_STMTS (node
).exists ());
1721 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy
), i
, child
)
1723 SLP_TREE_CHILDREN (copy
)[i
] = slp_copy_subtree (child
, map
);
1724 SLP_TREE_CHILDREN (copy
)[i
]->refcnt
++;
1729 /* Rearrange the statements of NODE according to PERMUTATION. */
1732 vect_slp_rearrange_stmts (slp_tree node
, unsigned int group_size
,
1733 vec
<unsigned> permutation
,
1734 hash_set
<slp_tree
> &visited
)
1739 if (visited
.add (node
))
1742 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1743 vect_slp_rearrange_stmts (child
, group_size
, permutation
, visited
);
1745 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1747 gcc_assert (group_size
== SLP_TREE_SCALAR_STMTS (node
).length ());
1748 vec
<stmt_vec_info
> tmp_stmts
;
1749 tmp_stmts
.create (group_size
);
1750 tmp_stmts
.quick_grow (group_size
);
1751 stmt_vec_info stmt_info
;
1752 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1753 tmp_stmts
[permutation
[i
]] = stmt_info
;
1754 SLP_TREE_SCALAR_STMTS (node
).release ();
1755 SLP_TREE_SCALAR_STMTS (node
) = tmp_stmts
;
1757 if (SLP_TREE_SCALAR_OPS (node
).exists ())
1759 gcc_assert (group_size
== SLP_TREE_SCALAR_OPS (node
).length ());
1761 tmp_ops
.create (group_size
);
1762 tmp_ops
.quick_grow (group_size
);
1764 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1765 tmp_ops
[permutation
[i
]] = op
;
1766 SLP_TREE_SCALAR_OPS (node
).release ();
1767 SLP_TREE_SCALAR_OPS (node
) = tmp_ops
;
1769 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1771 gcc_assert (group_size
== SLP_TREE_LANE_PERMUTATION (node
).length ());
1772 for (i
= 0; i
< group_size
; ++i
)
1773 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
1774 = permutation
[SLP_TREE_LANE_PERMUTATION (node
)[i
].second
];
1779 /* Attempt to reorder stmts in a reduction chain so that we don't
1780 require any load permutation. Return true if that was possible,
1781 otherwise return false. */
1784 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn
)
1788 slp_tree node
, load
;
1790 if (SLP_INSTANCE_LOADS (slp_instn
).is_empty ())
1793 /* Compare all the permutation sequences to the first one. We know
1794 that at least one load is permuted. */
1795 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1796 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1798 unsigned int group_size
= SLP_TREE_LOAD_PERMUTATION (node
).length ();
1799 for (i
= 1; SLP_INSTANCE_LOADS (slp_instn
).iterate (i
, &load
); ++i
)
1801 if (!SLP_TREE_LOAD_PERMUTATION (load
).exists ()
1802 || SLP_TREE_LOAD_PERMUTATION (load
).length () != group_size
)
1804 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load
), j
, lidx
)
1805 if (lidx
!= SLP_TREE_LOAD_PERMUTATION (node
)[j
])
1809 /* Check that the loads in the first sequence are different and there
1810 are no gaps between them. */
1811 auto_sbitmap
load_index (group_size
);
1812 bitmap_clear (load_index
);
1813 FOR_EACH_VEC_ELT (node
->load_permutation
, i
, lidx
)
1815 if (lidx
>= group_size
)
1817 if (bitmap_bit_p (load_index
, lidx
))
1820 bitmap_set_bit (load_index
, lidx
);
1822 for (i
= 0; i
< group_size
; i
++)
1823 if (!bitmap_bit_p (load_index
, i
))
1826 /* This permutation is valid for reduction. Since the order of the
1827 statements in the nodes is not important unless they are memory
1828 accesses, we can rearrange the statements in all the nodes
1829 according to the order of the loads. */
1831 /* We have to unshare the SLP tree we modify. */
1832 hash_map
<slp_tree
, slp_tree
> map
;
1833 slp_tree unshared
= slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn
), map
);
1834 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn
), false);
1836 SLP_INSTANCE_TREE (slp_instn
) = unshared
;
1837 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1838 SLP_INSTANCE_LOADS (slp_instn
)[i
] = *map
.get (node
);
1839 node
= SLP_INSTANCE_LOADS (slp_instn
)[0];
1841 /* Do the actual re-arrangement. */
1842 hash_set
<slp_tree
> visited
;
1843 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn
), group_size
,
1844 node
->load_permutation
, visited
);
1846 /* We are done, no actual permutations need to be generated. */
1847 poly_uint64 unrolling_factor
= SLP_INSTANCE_UNROLLING_FACTOR (slp_instn
);
1848 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1850 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
1851 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
1852 /* But we have to keep those permutations that are required because
1853 of handling of gaps. */
1854 if (known_eq (unrolling_factor
, 1U)
1855 || (group_size
== DR_GROUP_SIZE (first_stmt_info
)
1856 && DR_GROUP_GAP (first_stmt_info
) == 0))
1857 SLP_TREE_LOAD_PERMUTATION (node
).release ();
1859 for (j
= 0; j
< SLP_TREE_LOAD_PERMUTATION (node
).length (); ++j
)
1860 SLP_TREE_LOAD_PERMUTATION (node
)[j
] = j
;
1866 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1869 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
1870 hash_set
<slp_tree
> &visited
)
1872 if (visited
.add (node
))
1875 if (SLP_TREE_CHILDREN (node
).length () == 0)
1877 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
1879 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
1880 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1881 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1882 loads
.safe_push (node
);
1888 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1889 vect_gather_slp_loads (loads
, child
, visited
);
1894 vect_gather_slp_loads (slp_instance inst
, slp_tree node
)
1896 hash_set
<slp_tree
> visited
;
1897 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst
), node
, visited
);
1901 /* Find the last store in SLP INSTANCE. */
1904 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
1906 stmt_vec_info last
= NULL
;
1907 stmt_vec_info stmt_vinfo
;
1909 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
1911 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
1912 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
1918 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1919 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1920 (also containing the first GROUP1_SIZE stmts, since stores are
1921 consecutive), the second containing the remainder.
1922 Return the first stmt in the second group. */
1924 static stmt_vec_info
1925 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
1927 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
1928 gcc_assert (group1_size
> 0);
1929 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
1930 gcc_assert (group2_size
> 0);
1931 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
1933 stmt_vec_info stmt_info
= first_vinfo
;
1934 for (unsigned i
= group1_size
; i
> 1; i
--)
1936 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
1937 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
1939 /* STMT is now the last element of the first group. */
1940 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
1941 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
1943 DR_GROUP_SIZE (group2
) = group2_size
;
1944 for (stmt_info
= group2
; stmt_info
;
1945 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
1947 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
1948 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
1951 /* For the second group, the DR_GROUP_GAP is that before the original group,
1952 plus skipping over the first vector. */
1953 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
1955 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1956 DR_GROUP_GAP (first_vinfo
) += group2_size
;
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
1960 group1_size
, group2_size
);
1965 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1966 statements and a vector of NUNITS elements. */
1969 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
1971 return exact_div (common_multiple (nunits
, group_size
), group_size
);
1974 /* Analyze an SLP instance starting from a group of grouped stores. Call
1975 vect_build_slp_tree to build a tree of packed stmts if possible.
1976 Return FALSE if it's impossible to SLP any stmt in the loop. */
1979 vect_analyze_slp_instance (vec_info
*vinfo
,
1980 scalar_stmts_to_slp_tree_map_t
*bst_map
,
1981 stmt_vec_info stmt_info
, unsigned max_tree_size
)
1983 slp_instance new_instance
;
1985 unsigned int group_size
;
1986 tree vectype
, scalar_type
= NULL_TREE
;
1988 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
1989 vec
<stmt_vec_info
> scalar_stmts
;
1990 bool constructor
= false;
1992 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1994 scalar_type
= TREE_TYPE (DR_REF (dr
));
1995 group_size
= DR_GROUP_SIZE (stmt_info
);
1996 vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
, group_size
);
1998 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2000 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2001 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2002 group_size
= REDUC_GROUP_SIZE (stmt_info
);
2004 else if (is_gimple_assign (stmt_info
->stmt
)
2005 && gimple_assign_rhs_code (stmt_info
->stmt
) == CONSTRUCTOR
)
2007 vectype
= TREE_TYPE (gimple_assign_rhs1 (stmt_info
->stmt
));
2008 group_size
= CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info
->stmt
));
2013 gcc_assert (is_a
<loop_vec_info
> (vinfo
));
2014 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2015 group_size
= as_a
<loop_vec_info
> (vinfo
)->reductions
.length ();
2020 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2022 "Build SLP failed: unsupported data-type %T\n",
2027 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
2029 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2030 scalar_stmts
.create (group_size
);
2031 stmt_vec_info next_info
= stmt_info
;
2032 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2034 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2037 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2038 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2041 else if (!dr
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2043 /* Collect the reduction stmts and store them in
2044 SLP_TREE_SCALAR_STMTS. */
2047 scalar_stmts
.safe_push (vect_stmt_to_vectorize (next_info
));
2048 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2050 /* Mark the first element of the reduction chain as reduction to properly
2051 transform the node. In the reduction analysis phase only the last
2052 element of the chain is marked as reduction. */
2053 STMT_VINFO_DEF_TYPE (stmt_info
)
2054 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2055 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2056 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2058 else if (constructor
)
2060 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2062 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2064 if (TREE_CODE (val
) == SSA_NAME
)
2066 gimple
* def
= SSA_NAME_DEF_STMT (val
);
2067 stmt_vec_info def_info
= vinfo
->lookup_stmt (def
);
2068 /* Value is defined in another basic block. */
2071 def_info
= vect_stmt_to_vectorize (def_info
);
2072 scalar_stmts
.safe_push (def_info
);
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_NOTE
, vect_location
,
2079 "Analyzing vectorizable constructor: %G\n",
2084 /* Collect reduction statements. */
2085 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2086 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2087 scalar_stmts
.safe_push (next_info
);
2090 /* Build the tree for the SLP instance. */
2091 bool *matches
= XALLOCAVEC (bool, group_size
);
2092 unsigned npermutes
= 0;
2093 poly_uint64 max_nunits
= nunits
;
2094 unsigned tree_size
= 0;
2095 node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2096 &max_nunits
, matches
, &npermutes
,
2097 &tree_size
, bst_map
);
2100 /* Calculate the unrolling factor based on the smallest type. */
2101 poly_uint64 unrolling_factor
2102 = calculate_unrolling_factor (max_nunits
, group_size
);
2104 if (maybe_ne (unrolling_factor
, 1U)
2105 && is_a
<bb_vec_info
> (vinfo
))
2107 unsigned HOST_WIDE_INT const_max_nunits
;
2108 if (!max_nunits
.is_constant (&const_max_nunits
)
2109 || const_max_nunits
> group_size
)
2111 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2113 "Build SLP failed: store group "
2114 "size not a multiple of the vector size "
2115 "in basic block SLP\n");
2116 vect_free_slp_tree (node
, false);
2119 /* Fatal mismatch. */
2120 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2121 vect_free_slp_tree (node
, false);
2125 /* Create a new SLP instance. */
2126 new_instance
= XNEW (class _slp_instance
);
2127 SLP_INSTANCE_TREE (new_instance
) = node
;
2128 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2129 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2130 SLP_INSTANCE_ROOT_STMT (new_instance
) = constructor
? stmt_info
: NULL
;
2132 vect_gather_slp_loads (new_instance
, node
);
2133 if (dump_enabled_p ())
2134 dump_printf_loc (MSG_NOTE
, vect_location
,
2135 "SLP size %u vs. limit %u.\n",
2136 tree_size
, max_tree_size
);
2138 /* Check whether any load is possibly permuted. */
2140 bool loads_permuted
= false;
2141 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2143 if (!SLP_TREE_LOAD_PERMUTATION (load_node
).exists ())
2146 stmt_vec_info load_info
;
2147 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load_info
)
2148 if (SLP_TREE_LOAD_PERMUTATION (load_node
)[j
] != j
)
2150 loads_permuted
= true;
2155 /* If the loads and stores can be handled with load/store-lane
2156 instructions do not generate this SLP instance. */
2157 if (is_a
<loop_vec_info
> (vinfo
)
2159 && dr
&& vect_store_lanes_supported (vectype
, group_size
, false))
2162 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance
), i
, load_node
)
2164 stmt_vec_info stmt_vinfo
= DR_GROUP_FIRST_ELEMENT
2165 (SLP_TREE_SCALAR_STMTS (load_node
)[0]);
2166 /* Use SLP for strided accesses (or if we can't load-lanes). */
2167 if (STMT_VINFO_STRIDED_P (stmt_vinfo
)
2168 || ! vect_load_lanes_supported
2169 (STMT_VINFO_VECTYPE (stmt_vinfo
),
2170 DR_GROUP_SIZE (stmt_vinfo
), false))
2173 if (i
== SLP_INSTANCE_LOADS (new_instance
).length ())
2175 if (dump_enabled_p ())
2176 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2177 "Built SLP cancelled: can use "
2178 "load/store-lanes\n");
2179 vect_free_slp_instance (new_instance
, false);
2184 /* If this is a reduction chain with a conversion in front
2185 amend the SLP tree with a node for that. */
2187 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
2188 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
)
2190 /* Get at the conversion stmt - we know it's the single use
2191 of the last stmt of the reduction chain. */
2192 gimple
*tem
= vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2193 use_operand_p use_p
;
2195 bool r
= single_imm_use (gimple_assign_lhs (tem
),
2198 next_info
= vinfo
->lookup_stmt (use_stmt
);
2199 next_info
= vect_stmt_to_vectorize (next_info
);
2200 scalar_stmts
= vNULL
;
2201 scalar_stmts
.create (group_size
);
2202 for (unsigned i
= 0; i
< group_size
; ++i
)
2203 scalar_stmts
.quick_push (next_info
);
2204 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2205 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2206 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2207 SLP_INSTANCE_TREE (new_instance
) = conv
;
2208 /* We also have to fake this conversion stmt as SLP reduction
2209 group so we don't have to mess with too much code
2211 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2212 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2215 vinfo
->slp_instances
.safe_push (new_instance
);
2217 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2218 the number of scalar stmts in the root in a few places.
2219 Verify that assumption holds. */
2220 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2221 .length () == group_size
);
2223 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_NOTE
, vect_location
,
2226 "Final SLP tree for instance:\n");
2227 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2228 SLP_INSTANCE_TREE (new_instance
));
2236 /* Failed to SLP. */
2237 /* Free the allocated memory. */
2238 scalar_stmts
.release ();
2241 /* For basic block SLP, try to break the group up into multiples of the
2243 unsigned HOST_WIDE_INT const_nunits
;
2244 if (is_a
<bb_vec_info
> (vinfo
)
2245 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2246 && DR_GROUP_FIRST_ELEMENT (stmt_info
)
2247 && nunits
.is_constant (&const_nunits
))
2249 /* We consider breaking the group only on VF boundaries from the existing
2251 for (i
= 0; i
< group_size
; i
++)
2252 if (!matches
[i
]) break;
2254 if (i
>= const_nunits
&& i
< group_size
)
2256 /* Split into two groups at the first vector boundary before i. */
2257 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2258 unsigned group1_size
= i
& ~(const_nunits
- 1);
2260 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2262 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2264 /* If the first non-match was in the middle of a vector,
2265 skip the rest of that vector. */
2266 if (group1_size
< i
)
2268 i
= group1_size
+ const_nunits
;
2270 rest
= vect_split_slp_store_group (rest
, const_nunits
);
2273 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2274 rest
, max_tree_size
);
2277 /* Even though the first vector did not all match, we might be able to SLP
2278 (some) of the remainder. FORNOW ignore this possibility. */
2285 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2286 trees of packed scalar stmts if SLP is possible. */
2289 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2292 stmt_vec_info first_element
;
2294 DUMP_VECT_SCOPE ("vect_analyze_slp");
2296 scalar_stmts_to_slp_tree_map_t
*bst_map
2297 = new scalar_stmts_to_slp_tree_map_t ();
2299 /* Find SLP sequences starting from groups of grouped stores. */
2300 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2301 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
, max_tree_size
);
2303 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2305 if (loop_vinfo
->reduction_chains
.length () > 0)
2307 /* Find SLP sequences starting from reduction chains. */
2308 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2309 if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2312 /* Dissolve reduction chain group. */
2313 stmt_vec_info vinfo
= first_element
;
2314 stmt_vec_info last
= NULL
;
2317 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2318 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2319 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2323 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2324 /* It can be still vectorized as part of an SLP reduction. */
2325 loop_vinfo
->reductions
.safe_push (last
);
2329 /* Find SLP sequences starting from groups of reductions. */
2330 if (loop_vinfo
->reductions
.length () > 1)
2331 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2335 /* The map keeps a reference on SLP nodes built, release that. */
2336 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2337 it
!= bst_map
->end (); ++it
)
2339 vect_free_slp_tree ((*it
).second
, false);
2342 /* Optimize permutations in SLP reductions. */
2343 slp_instance instance
;
2344 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
2346 slp_tree node
= SLP_INSTANCE_TREE (instance
);
2347 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2348 /* Reduction (there are no data-refs in the root).
2349 In reduction chain the order of the loads is not important. */
2350 if (!STMT_VINFO_DATA_REF (stmt_info
)
2351 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2352 vect_attempt_slp_rearrange_stmts (instance
);
2355 /* Gather all loads in the SLP graph. */
2356 hash_set
<slp_tree
> visited
;
2357 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
2358 vect_gather_slp_loads (vinfo
->slp_loads
, SLP_INSTANCE_TREE (instance
),
2361 return opt_result::success ();
2365 vect_optimize_slp (vec_info
*vinfo
)
2369 FOR_EACH_VEC_ELT (vinfo
->slp_loads
, i
, node
)
2371 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2374 /* In basic block vectorization we allow any subchain of an interleaving
2376 FORNOW: not in loop SLP because of realignment complications. */
2377 if (is_a
<bb_vec_info
> (vinfo
))
2379 bool subchain_p
= true;
2380 stmt_vec_info next_load_info
= NULL
;
2381 stmt_vec_info load_info
;
2383 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
2386 && (next_load_info
!= load_info
2387 || DR_GROUP_GAP (load_info
) != 1))
2392 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
2396 SLP_TREE_LOAD_PERMUTATION (node
).release ();
2402 stmt_vec_info load_info
;
2403 bool this_load_permuted
= false;
2405 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
2406 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
2408 this_load_permuted
= true;
2411 stmt_vec_info first_stmt_info
2412 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
2413 if (!this_load_permuted
2414 /* The load requires permutation when unrolling exposes
2415 a gap either because the group is larger than the SLP
2416 group-size or because there is a gap between the groups. */
2417 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a
<loop_vec_info
> (vinfo
)), 1U)
2418 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
2419 && DR_GROUP_GAP (first_stmt_info
) == 0)))
2421 SLP_TREE_LOAD_PERMUTATION (node
).release ();
2429 /* For each possible SLP instance decide whether to SLP it and calculate overall
2430 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2431 least one instance. */
2434 vect_make_slp_decision (loop_vec_info loop_vinfo
)
2437 poly_uint64 unrolling_factor
= 1;
2438 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
2439 slp_instance instance
;
2440 int decided_to_slp
= 0;
2442 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2444 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2446 /* FORNOW: SLP if you can. */
2447 /* All unroll factors have the form:
2449 GET_MODE_SIZE (vinfo->vector_mode) * X
2451 for some rational X, so they must have a common multiple. */
2453 = force_common_multiple (unrolling_factor
,
2454 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
2456 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2457 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2458 loop-based vectorization. Such stmts will be marked as HYBRID. */
2459 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
2463 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
2465 if (decided_to_slp
&& dump_enabled_p ())
2467 dump_printf_loc (MSG_NOTE
, vect_location
,
2468 "Decided to SLP %d instances. Unrolling factor ",
2470 dump_dec (MSG_NOTE
, unrolling_factor
);
2471 dump_printf (MSG_NOTE
, "\n");
2474 return (decided_to_slp
> 0);
2477 /* Private data for vect_detect_hybrid_slp. */
2480 loop_vec_info loop_vinfo
;
2481 vec
<stmt_vec_info
> *worklist
;
2484 /* Walker for walk_gimple_op. */
2487 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
2489 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
2490 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
2495 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
2498 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
2499 if (PURE_SLP_STMT (def_stmt_info
))
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
2503 def_stmt_info
->stmt
);
2504 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
2505 dat
->worklist
->safe_push (def_stmt_info
);
2511 /* Find stmts that must be both vectorized and SLPed. */
2514 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
2516 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2518 /* All stmts participating in SLP are marked pure_slp, all other
2519 stmts are loop_vect.
2520 First collect all loop_vect stmts into a worklist. */
2521 auto_vec
<stmt_vec_info
> worklist
;
2522 for (unsigned i
= 0; i
< LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
; ++i
)
2524 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
2525 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
2528 gphi
*phi
= gsi
.phi ();
2529 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
2530 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
2531 worklist
.safe_push (stmt_info
);
2533 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
2536 gimple
*stmt
= gsi_stmt (gsi
);
2537 if (is_gimple_debug (stmt
))
2539 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
2540 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
2542 for (gimple_stmt_iterator gsi2
2543 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
2544 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
2546 stmt_vec_info patt_info
2547 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
2548 if (!STMT_SLP_TYPE (patt_info
)
2549 && STMT_VINFO_RELEVANT (patt_info
))
2550 worklist
.safe_push (patt_info
);
2552 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
2554 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
2555 worklist
.safe_push (stmt_info
);
2559 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2560 mark any SLP vectorized stmt as hybrid.
2561 ??? We're visiting def stmts N times (once for each non-SLP and
2562 once for each hybrid-SLP use). */
2565 dat
.worklist
= &worklist
;
2566 dat
.loop_vinfo
= loop_vinfo
;
2567 memset (&wi
, 0, sizeof (wi
));
2568 wi
.info
= (void *)&dat
;
2569 while (!worklist
.is_empty ())
2571 stmt_vec_info stmt_info
= worklist
.pop ();
2572 /* Since SSA operands are not set up for pattern stmts we need
2573 to use walk_gimple_op. */
2575 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
2580 /* Initialize a bb_vec_info struct for the statements between
2581 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2583 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in
,
2584 gimple_stmt_iterator region_end_in
,
2585 vec_info_shared
*shared
)
2586 : vec_info (vec_info::bb
, init_cost (NULL
), shared
),
2587 bb (gsi_bb (region_begin_in
)),
2588 region_begin (region_begin_in
),
2589 region_end (region_end_in
)
2591 for (gimple
*stmt
: this->region_stmts ())
2593 gimple_set_uid (stmt
, 0);
2594 if (is_gimple_debug (stmt
))
2603 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2604 stmts in the basic block. */
2606 _bb_vec_info::~_bb_vec_info ()
2608 for (gimple
*stmt
: this->region_stmts ())
2609 /* Reset region marker. */
2610 gimple_set_uid (stmt
, -1);
2615 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2616 given then that child nodes have already been processed, and that
2617 their def types currently match their SLP node's def type. */
2620 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
2621 slp_instance node_instance
,
2622 stmt_vector_for_cost
*cost_vec
)
2624 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
2625 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
2627 /* Calculate the number of vector statements to be created for the
2628 scalar stmts in this node. For SLP reductions it is equal to the
2629 number of vector statements in the children (which has already been
2630 calculated by the recursive call). Otherwise it is the number of
2631 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2632 VF divided by the number of elements in a vector. */
2633 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2634 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
2636 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
2637 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
2639 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
2640 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
2647 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2648 vf
= loop_vinfo
->vectorization_factor
;
2651 unsigned int group_size
= SLP_TREE_LANES (node
);
2652 tree vectype
= SLP_TREE_VECTYPE (node
);
2653 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
2654 = vect_get_num_vectors (vf
* group_size
, vectype
);
2657 /* Handle purely internal nodes. */
2658 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2659 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
2662 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
2663 node
, node_instance
, cost_vec
);
2666 /* Try to build NODE from scalars, returning true on success.
2667 NODE_INSTANCE is the SLP instance that contains NODE. */
2670 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
2671 slp_instance node_instance
)
2673 stmt_vec_info stmt_info
;
2676 if (!is_a
<bb_vec_info
> (vinfo
)
2677 || node
== SLP_INSTANCE_TREE (node_instance
)
2678 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_NOTE
, vect_location
,
2683 "Building vector operands from scalars instead\n");
2685 /* Don't remove and free the child nodes here, since they could be
2686 referenced by other structures. The analysis and scheduling phases
2687 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2688 unsigned int group_size
= SLP_TREE_LANES (node
);
2689 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
2690 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
);
2691 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2693 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
2694 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
2699 /* Compute the prologue cost for invariant or constant operands represented
2703 vect_prologue_cost_for_slp (slp_tree node
,
2704 stmt_vector_for_cost
*cost_vec
)
2706 /* Without looking at the actual initializer a vector of
2707 constants can be implemented as load from the constant pool.
2708 When all elements are the same we can use a splat. */
2709 tree vectype
= SLP_TREE_VECTYPE (node
);
2710 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
2711 unsigned num_vects_to_check
;
2712 unsigned HOST_WIDE_INT const_nunits
;
2713 unsigned nelt_limit
;
2714 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
2715 && ! multiple_p (const_nunits
, group_size
))
2717 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
2718 nelt_limit
= const_nunits
;
2722 /* If either the vector has variable length or the vectors
2723 are composed of repeated whole groups we only need to
2724 cost construction once. All vectors will be the same. */
2725 num_vects_to_check
= 1;
2726 nelt_limit
= group_size
;
2728 tree elt
= NULL_TREE
;
2730 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
2732 unsigned si
= j
% group_size
;
2734 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
2735 /* ??? We're just tracking whether all operands of a single
2736 vector initializer are the same, ideally we'd check if
2737 we emitted the same one already. */
2738 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
2741 if (nelt
== nelt_limit
)
2743 record_stmt_cost (cost_vec
, 1,
2744 SLP_TREE_DEF_TYPE (node
) == vect_external_def
2745 ? (elt
? scalar_to_vec
: vec_construct
)
2747 NULL
, vectype
, 0, vect_prologue
);
2753 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2754 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2756 Return true if the operations are supported. */
2759 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
2760 slp_instance node_instance
,
2761 hash_set
<slp_tree
> &visited
,
2762 hash_set
<slp_tree
> &lvisited
,
2763 stmt_vector_for_cost
*cost_vec
)
2768 /* Assume we can code-generate all invariants. */
2769 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2772 /* If we already analyzed the exact same set of scalar stmts we're done.
2773 We share the generated vector stmts for those.
2774 The SLP graph is acyclic so not caching whether we failed or succeeded
2775 doesn't result in any issue since we throw away the lvisited set
2777 if (visited
.contains (node
)
2778 || lvisited
.add (node
))
2781 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2782 if (!vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
2783 visited
, lvisited
, cost_vec
))
2786 bool res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
2789 /* When the node can be vectorized cost invariant nodes it references.
2790 This is not done in DFS order to allow the refering node
2791 vectorizable_* calls to nail down the invariant nodes vector type
2792 and possibly unshare it if it needs a different vector type than
2795 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2796 if ((SLP_TREE_DEF_TYPE (child
) == vect_constant_def
2797 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
2798 /* Perform usual caching, note code-generation still
2799 code-gens these nodes multiple times but we expect
2800 to CSE them later. */
2801 && !visited
.contains (child
)
2802 && !lvisited
.add (child
))
2804 /* ??? After auditing more code paths make a "default"
2805 and push the vector type from NODE to all children
2806 if it is not already set. */
2807 /* Compute the number of vectors to be generated. */
2808 tree vector_type
= SLP_TREE_VECTYPE (child
);
2811 /* For shifts with a scalar argument we don't need
2812 to cost or code-generate anything.
2813 ??? Represent this more explicitely. */
2814 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
2815 == shift_vec_info_type
)
2819 unsigned group_size
= SLP_TREE_SCALAR_OPS (child
).length ();
2821 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2822 vf
= loop_vinfo
->vectorization_factor
;
2823 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
2824 = vect_get_num_vectors (vf
* group_size
, vector_type
);
2825 /* And cost them. */
2826 vect_prologue_cost_for_slp (child
, cost_vec
);
2829 /* If this node can't be vectorized, try pruning the tree here rather
2830 than felling the whole thing. */
2831 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
2833 /* We'll need to revisit this for invariant costing and number
2834 of vectorized stmt setting. */
2835 lvisited
.remove (node
);
2843 /* Analyze statements in SLP instances of VINFO. Return true if the
2844 operations are supported. */
2847 vect_slp_analyze_operations (vec_info
*vinfo
)
2849 slp_instance instance
;
2852 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2854 hash_set
<slp_tree
> visited
;
2855 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
2857 hash_set
<slp_tree
> lvisited
;
2858 stmt_vector_for_cost cost_vec
;
2859 cost_vec
.create (2);
2860 if (!vect_slp_analyze_node_operations (vinfo
,
2861 SLP_INSTANCE_TREE (instance
),
2862 instance
, visited
, lvisited
,
2864 /* Instances with a root stmt require vectorized defs for the
2866 || (SLP_INSTANCE_ROOT_STMT (instance
)
2867 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
2868 != vect_internal_def
)))
2870 slp_tree node
= SLP_INSTANCE_TREE (instance
);
2871 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2872 if (dump_enabled_p ())
2873 dump_printf_loc (MSG_NOTE
, vect_location
,
2874 "removing SLP instance operations starting from: %G",
2876 vect_free_slp_instance (instance
, false);
2877 vinfo
->slp_instances
.ordered_remove (i
);
2878 cost_vec
.release ();
2882 for (hash_set
<slp_tree
>::iterator x
= lvisited
.begin();
2883 x
!= lvisited
.end(); ++x
)
2887 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
2888 cost_vec
.release ();
2892 return !vinfo
->slp_instances
.is_empty ();
2896 /* Compute the scalar cost of the SLP node NODE and its children
2897 and return it. Do not account defs that are marked in LIFE and
2898 update LIFE according to uses of NODE. */
2901 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
2902 slp_tree node
, vec
<bool, va_heap
> *life
,
2903 stmt_vector_for_cost
*cost_vec
,
2904 hash_set
<slp_tree
> &visited
)
2907 stmt_vec_info stmt_info
;
2910 if (visited
.add (node
))
2913 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2915 gimple
*stmt
= stmt_info
->stmt
;
2916 ssa_op_iter op_iter
;
2917 def_operand_p def_p
;
2922 /* If there is a non-vectorized use of the defs then the scalar
2923 stmt is kept live in which case we do not account it or any
2924 required defs in the SLP children in the scalar cost. This
2925 way we make the vectorization more costly when compared to
2927 FOR_EACH_SSA_DEF_OPERAND (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
2929 imm_use_iterator use_iter
;
2931 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
2932 if (!is_gimple_debug (use_stmt
))
2934 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
2935 if (!use_stmt_info
|| !PURE_SLP_STMT (use_stmt_info
))
2938 BREAK_FROM_IMM_USE_STMT (use_iter
);
2945 /* Count scalar stmts only once. */
2946 if (gimple_visited_p (stmt
))
2948 gimple_set_visited (stmt
, true);
2950 vect_cost_for_stmt kind
;
2951 if (STMT_VINFO_DATA_REF (stmt_info
))
2953 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2956 kind
= scalar_store
;
2958 else if (vect_nop_conversion_p (stmt_info
))
2962 record_stmt_cost (cost_vec
, 1, kind
, stmt_info
, 0, vect_body
);
2965 auto_vec
<bool, 20> subtree_life
;
2966 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2968 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2970 /* Do not directly pass LIFE to the recursive call, copy it to
2971 confine changes in the callee to the current child/subtree. */
2972 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
2974 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
));
2975 for (unsigned j
= 0;
2976 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
2978 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
2979 if (perm
.first
== i
)
2980 subtree_life
[perm
.second
] = (*life
)[j
];
2985 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
2986 subtree_life
.safe_splice (*life
);
2988 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
2990 subtree_life
.truncate (0);
2995 /* Check if vectorization of the basic block is profitable. */
2998 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
)
3000 vec
<slp_instance
> slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
3001 slp_instance instance
;
3003 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
3004 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
3006 /* Calculate scalar cost. */
3007 stmt_vector_for_cost scalar_costs
;
3008 scalar_costs
.create (0);
3009 hash_set
<slp_tree
> visited
;
3010 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3012 auto_vec
<bool, 20> life
;
3013 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)));
3014 vect_bb_slp_scalar_cost (bb_vinfo
,
3015 SLP_INSTANCE_TREE (instance
),
3016 &life
, &scalar_costs
, visited
);
3018 /* Unset visited flag. */
3019 stmt_info_for_cost
*si
;
3020 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
3021 gimple_set_visited (si
->stmt_info
->stmt
, false);
3023 void *target_cost_data
= init_cost (NULL
);
3024 add_stmt_costs (bb_vinfo
, target_cost_data
, &scalar_costs
);
3025 scalar_costs
.release ();
3027 finish_cost (target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
3028 destroy_cost_data (target_cost_data
);
3030 /* Complete the target-specific cost calculation. */
3031 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo
), &vec_prologue_cost
,
3032 &vec_inside_cost
, &vec_epilogue_cost
);
3034 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
3036 if (dump_enabled_p ())
3038 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
3039 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
3041 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
3042 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
3043 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
3046 /* Vectorization is profitable if its cost is more than the cost of scalar
3047 version. Note that we err on the vector side for equal cost because
3048 the cost estimate is otherwise quite pessimistic (constant uses are
3049 free on the scalar side but cost a load on the vector side for
3051 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
3057 /* Find any vectorizable constructors and add them to the grouped_store
3061 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
3063 for (gimple
*stmt
: bb_vinfo
->region_stmts ())
3065 gassign
*assign
= dyn_cast
<gassign
*> (stmt
);
3066 if (!assign
|| gimple_assign_rhs_code (assign
) != CONSTRUCTOR
)
3069 tree rhs
= gimple_assign_rhs1 (assign
);
3070 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
3071 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
3072 CONSTRUCTOR_NELTS (rhs
))
3073 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
3074 || uniform_vector_p (rhs
))
3077 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
3078 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
3082 /* Check if the region described by BB_VINFO can be vectorized, returning
3083 true if so. When returning false, set FATAL to true if the same failure
3084 would prevent vectorization at other vector sizes, false if it is still
3085 worth trying other sizes. N_STMTS is the number of statements in the
3089 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
)
3091 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3093 slp_instance instance
;
3095 poly_uint64 min_vf
= 2;
3097 /* The first group of checks is independent of the vector size. */
3100 /* Analyze the data references. */
3102 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
3104 if (dump_enabled_p ())
3105 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3106 "not vectorized: unhandled data-ref in basic "
3111 if (BB_VINFO_DATAREFS (bb_vinfo
).length () < 2)
3113 if (dump_enabled_p ())
3114 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3115 "not vectorized: not enough data-refs in "
3120 if (!vect_analyze_data_ref_accesses (bb_vinfo
))
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3124 "not vectorized: unhandled data access in "
3129 vect_slp_check_for_constructors (bb_vinfo
);
3131 /* If there are no grouped stores in the region there is no need
3132 to continue with pattern recog as vect_analyze_slp will fail
3134 if (bb_vinfo
->grouped_stores
.is_empty ())
3136 if (dump_enabled_p ())
3137 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3138 "not vectorized: no grouped stores in "
3143 /* While the rest of the analysis below depends on it in some way. */
3146 vect_pattern_recog (bb_vinfo
);
3148 /* Check the SLP opportunities in the basic block, analyze and build SLP
3150 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
3152 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3155 "Failed to SLP the basic block.\n");
3156 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3157 "not vectorized: failed to find SLP opportunities "
3158 "in basic block.\n");
3163 /* Optimize permutations. */
3164 vect_optimize_slp (bb_vinfo
);
3166 vect_record_base_alignments (bb_vinfo
);
3168 /* Analyze and verify the alignment of data references and the
3169 dependence in the SLP instances. */
3170 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
3172 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo
, instance
)
3173 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
3175 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3176 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3177 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE
, vect_location
,
3179 "removing SLP instance operations starting from: %G",
3181 vect_free_slp_instance (instance
, false);
3182 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
3186 /* Mark all the statements that we want to vectorize as pure SLP and
3188 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3189 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
3190 if (SLP_INSTANCE_ROOT_STMT (instance
))
3191 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance
)) = pure_slp
;
3195 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
3198 if (!vect_slp_analyze_operations (bb_vinfo
))
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3202 "not vectorized: bad operation in basic block.\n");
3206 /* Cost model: check if the vectorization is worthwhile. */
3207 if (!unlimited_cost_model (NULL
)
3208 && !vect_bb_vectorization_profitable_p (bb_vinfo
))
3210 if (dump_enabled_p ())
3211 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3212 "not vectorized: vectorization is not "
3217 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE
, vect_location
,
3219 "Basic block will be vectorized using SLP\n");
3223 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3224 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3225 on success. The region has N_STMTS statements and has the datarefs
3226 given by DATAREFS. */
3229 vect_slp_bb_region (gimple_stmt_iterator region_begin
,
3230 gimple_stmt_iterator region_end
,
3231 vec
<data_reference_p
> datarefs
,
3232 unsigned int n_stmts
)
3234 bb_vec_info bb_vinfo
;
3235 auto_vector_modes vector_modes
;
3237 /* Autodetect first vector size we try. */
3238 machine_mode next_vector_mode
= VOIDmode
;
3239 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
3240 unsigned int mode_i
= 0;
3242 vec_info_shared shared
;
3244 machine_mode autodetected_vector_mode
= VOIDmode
;
3247 bool vectorized
= false;
3249 bb_vinfo
= new _bb_vec_info (region_begin
, region_end
, &shared
);
3251 bool first_time_p
= shared
.datarefs
.is_empty ();
3252 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
3254 bb_vinfo
->shared
->save_datarefs ();
3256 bb_vinfo
->shared
->check_datarefs ();
3257 bb_vinfo
->vector_mode
= next_vector_mode
;
3259 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
)
3260 && dbg_cnt (vect_slp
))
3262 if (dump_enabled_p ())
3264 dump_printf_loc (MSG_NOTE
, vect_location
,
3265 "***** Analysis succeeded with vector mode"
3266 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
3267 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
3270 bb_vinfo
->shared
->check_datarefs ();
3271 vect_schedule_slp (bb_vinfo
);
3273 unsigned HOST_WIDE_INT bytes
;
3274 if (dump_enabled_p ())
3276 if (GET_MODE_SIZE (bb_vinfo
->vector_mode
).is_constant (&bytes
))
3277 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
3278 "basic block part vectorized using %wu byte "
3279 "vectors\n", bytes
);
3281 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
3282 "basic block part vectorized using variable "
3283 "length vectors\n");
3290 if (dump_enabled_p ())
3291 dump_printf_loc (MSG_NOTE
, vect_location
,
3292 "***** Analysis failed with vector mode %s\n",
3293 GET_MODE_NAME (bb_vinfo
->vector_mode
));
3297 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
3300 while (mode_i
< vector_modes
.length ()
3301 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
3303 if (dump_enabled_p ())
3304 dump_printf_loc (MSG_NOTE
, vect_location
,
3305 "***** The result for vector mode %s would"
3307 GET_MODE_NAME (vector_modes
[mode_i
]));
3313 if (mode_i
< vector_modes
.length ()
3314 && VECTOR_MODE_P (autodetected_vector_mode
)
3315 && (related_vector_mode (vector_modes
[mode_i
],
3316 GET_MODE_INNER (autodetected_vector_mode
))
3317 == autodetected_vector_mode
)
3318 && (related_vector_mode (autodetected_vector_mode
,
3319 GET_MODE_INNER (vector_modes
[mode_i
]))
3320 == vector_modes
[mode_i
]))
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE
, vect_location
,
3324 "***** Skipping vector mode %s, which would"
3325 " repeat the analysis for %s\n",
3326 GET_MODE_NAME (vector_modes
[mode_i
]),
3327 GET_MODE_NAME (autodetected_vector_mode
));
3332 || mode_i
== vector_modes
.length ()
3333 || autodetected_vector_mode
== VOIDmode
3334 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3335 vector sizes will fail do not bother iterating. */
3339 /* Try the next biggest vector size. */
3340 next_vector_mode
= vector_modes
[mode_i
++];
3341 if (dump_enabled_p ())
3342 dump_printf_loc (MSG_NOTE
, vect_location
,
3343 "***** Re-trying analysis with vector mode %s\n",
3344 GET_MODE_NAME (next_vector_mode
));
3348 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3349 true if anything in the basic-block was vectorized. */
3352 vect_slp_bb (basic_block bb
)
3354 gimple_stmt_iterator gsi
;
3355 bool any_vectorized
= false;
3357 gsi
= gsi_after_labels (bb
);
3358 while (!gsi_end_p (gsi
))
3360 gimple_stmt_iterator region_begin
= gsi
;
3361 vec
<data_reference_p
> datarefs
= vNULL
;
3364 for (; !gsi_end_p (gsi
); gsi_next (&gsi
))
3366 gimple
*stmt
= gsi_stmt (gsi
);
3367 if (is_gimple_debug (stmt
))
3369 /* Skip leading debug stmts. */
3370 if (gsi_stmt (region_begin
) == stmt
)
3371 gsi_next (®ion_begin
);
3376 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
3377 vect_location
= stmt
;
3379 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
))
3382 if (gsi_end_p (region_begin
))
3385 /* Skip leading unhandled stmts. */
3386 if (gsi_stmt (region_begin
) == gsi_stmt (gsi
))
3392 gimple_stmt_iterator region_end
= gsi
;
3394 if (insns
> param_slp_max_insns_in_bb
)
3396 if (dump_enabled_p ())
3397 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3398 "not vectorized: too many instructions in "
3401 else if (vect_slp_bb_region (region_begin
, region_end
, datarefs
, insns
))
3402 any_vectorized
= true;
3404 if (gsi_end_p (region_end
))
3407 /* Skip the unhandled stmt. */
3411 return any_vectorized
;
3415 /* Build a variable-length vector in which the elements in ELTS are repeated
3416 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3417 RESULTS and add any new instructions to SEQ.
3419 The approach we use is:
3421 (1) Find a vector mode VM with integer elements of mode IM.
3423 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3424 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3425 from small vectors to IM.
3427 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3429 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3430 correct byte contents.
3432 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3434 We try to find the largest IM for which this sequence works, in order
3435 to cut down on the number of interleaves. */
3438 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
3439 vec
<tree
> elts
, unsigned int nresults
,
3442 unsigned int nelts
= elts
.length ();
3443 tree element_type
= TREE_TYPE (vector_type
);
3445 /* (1) Find a vector mode VM with integer elements of mode IM. */
3446 unsigned int nvectors
= 1;
3447 tree new_vector_type
;
3449 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
3450 &nvectors
, &new_vector_type
,
3454 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3455 unsigned int partial_nelts
= nelts
/ nvectors
;
3456 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
3458 tree_vector_builder partial_elts
;
3459 auto_vec
<tree
, 32> pieces (nvectors
* 2);
3460 pieces
.quick_grow (nvectors
* 2);
3461 for (unsigned int i
= 0; i
< nvectors
; ++i
)
3463 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3464 ELTS' has mode IM. */
3465 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
3466 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
3467 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
3468 tree t
= gimple_build_vector (seq
, &partial_elts
);
3469 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
3470 TREE_TYPE (new_vector_type
), t
);
3472 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3473 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
3476 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3477 correct byte contents.
3479 We need to repeat the following operation log2(nvectors) times:
3481 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3482 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3484 However, if each input repeats every N elements and the VF is
3485 a multiple of N * 2, the HI result is the same as the LO. */
3486 unsigned int in_start
= 0;
3487 unsigned int out_start
= nvectors
;
3488 unsigned int hi_start
= nvectors
/ 2;
3489 /* A bound on the number of outputs needed to produce NRESULTS results
3490 in the final iteration. */
3491 unsigned int noutputs_bound
= nvectors
* nresults
;
3492 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
3494 noutputs_bound
/= 2;
3495 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
3496 for (unsigned int i
= 0; i
< limit
; ++i
)
3499 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
3502 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
3506 tree output
= make_ssa_name (new_vector_type
);
3507 tree input1
= pieces
[in_start
+ (i
/ 2)];
3508 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
3509 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
3512 gimple_seq_add_stmt (seq
, stmt
);
3513 pieces
[out_start
+ i
] = output
;
3515 std::swap (in_start
, out_start
);
3518 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3519 results
.reserve (nresults
);
3520 for (unsigned int i
= 0; i
< nresults
; ++i
)
3522 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
3523 pieces
[in_start
+ i
]));
3525 results
.quick_push (results
[i
- nvectors
]);
3529 /* For constant and loop invariant defs in OP_NODE this function creates
3530 vector defs that will be used in the vectorized stmts and stores them
3531 to SLP_TREE_VEC_DEFS of OP_NODE. */
3534 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
3536 unsigned HOST_WIDE_INT nunits
;
3538 unsigned j
, number_of_places_left_in_vector
;
3541 int group_size
= op_node
->ops
.length ();
3542 unsigned int vec_num
, i
;
3543 unsigned number_of_copies
= 1;
3545 gimple_seq ctor_seq
= NULL
;
3546 auto_vec
<tree
, 16> permute_results
;
3548 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3549 vector_type
= SLP_TREE_VECTYPE (op_node
);
3551 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
3552 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
3553 auto_vec
<tree
> voprnds (number_of_vectors
);
3555 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3556 created vectors. It is greater than 1 if unrolling is performed.
3558 For example, we have two scalar operands, s1 and s2 (e.g., group of
3559 strided accesses of size two), while NUNITS is four (i.e., four scalars
3560 of this type can be packed in a vector). The output vector will contain
3561 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3564 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3565 containing the operands.
3567 For example, NUNITS is four as before, and the group size is 8
3568 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3569 {s5, s6, s7, s8}. */
3571 /* When using duplicate_and_interleave, we just need one element for
3572 each scalar statement. */
3573 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
3574 nunits
= group_size
;
3576 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
3578 number_of_places_left_in_vector
= nunits
;
3580 tree_vector_builder
elts (vector_type
, nunits
, 1);
3581 elts
.quick_grow (nunits
);
3582 stmt_vec_info insert_after
= NULL
;
3583 for (j
= 0; j
< number_of_copies
; j
++)
3586 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
3588 /* Create 'vect_ = {op0,op1,...,opn}'. */
3589 number_of_places_left_in_vector
--;
3591 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
3593 if (CONSTANT_CLASS_P (op
))
3595 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
3597 /* Can't use VIEW_CONVERT_EXPR for booleans because
3598 of possibly different sizes of scalar value and
3600 if (integer_zerop (op
))
3601 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
3602 else if (integer_onep (op
))
3603 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
3608 op
= fold_unary (VIEW_CONVERT_EXPR
,
3609 TREE_TYPE (vector_type
), op
);
3610 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
3614 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
3616 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
3619 = build_all_ones_cst (TREE_TYPE (vector_type
));
3621 = build_zero_cst (TREE_TYPE (vector_type
));
3622 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
3623 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
3629 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
3632 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
3635 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
3639 elts
[number_of_places_left_in_vector
] = op
;
3640 if (!CONSTANT_CLASS_P (op
))
3642 /* For BB vectorization we have to compute an insert location
3643 when a def is inside the analyzed region since we cannot
3644 simply insert at the BB start in this case. */
3645 stmt_vec_info opdef
;
3646 if (TREE_CODE (orig_op
) == SSA_NAME
3647 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
3648 && is_a
<bb_vec_info
> (vinfo
)
3649 && (opdef
= vinfo
->lookup_def (orig_op
)))
3652 insert_after
= opdef
;
3654 insert_after
= get_later_stmt (insert_after
, opdef
);
3657 if (number_of_places_left_in_vector
== 0)
3660 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
3661 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
3662 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
3665 if (permute_results
.is_empty ())
3666 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
3667 elts
, number_of_vectors
,
3669 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
3674 gimple_stmt_iterator gsi
= gsi_for_stmt (insert_after
->stmt
);
3675 /* vect_init_vector inserts before. */
3677 init
= vect_init_vector (vinfo
, NULL
, vec_cst
,
3681 init
= vect_init_vector (vinfo
, NULL
, vec_cst
,
3683 if (ctor_seq
!= NULL
)
3685 gimple_stmt_iterator gsi
3686 = gsi_for_stmt (SSA_NAME_DEF_STMT (init
));
3687 gsi_insert_seq_before (&gsi
, ctor_seq
, GSI_SAME_STMT
);
3690 voprnds
.quick_push (init
);
3691 insert_after
= NULL
;
3692 number_of_places_left_in_vector
= nunits
;
3694 elts
.new_vector (vector_type
, nunits
, 1);
3695 elts
.quick_grow (nunits
);
3700 /* Since the vectors are created in the reverse order, we should invert
3702 vec_num
= voprnds
.length ();
3703 for (j
= vec_num
; j
!= 0; j
--)
3705 vop
= voprnds
[j
- 1];
3706 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
3709 /* In case that VF is greater than the unrolling factor needed for the SLP
3710 group of stmts, NUMBER_OF_VECTORS to be created is greater than
3711 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3712 to replicate the vectors. */
3713 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
3714 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
3716 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
3719 /* Get the Ith vectorized definition from SLP_NODE. */
3722 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
3724 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
3725 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
3727 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
3730 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
3733 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
3735 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
3736 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
3739 gimple
*vec_def_stmt
;
3740 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
3741 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
3744 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
3747 /* Get N vectorized definitions for SLP_NODE. */
3750 vect_get_slp_defs (vec_info
*,
3751 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
3754 n
= SLP_TREE_CHILDREN (slp_node
).length ();
3756 for (unsigned i
= 0; i
< n
; ++i
)
3758 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
3759 vec
<tree
> vec_defs
= vNULL
;
3760 vect_get_slp_defs (child
, &vec_defs
);
3761 vec_oprnds
->quick_push (vec_defs
);
3765 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3766 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3767 permute statements for the SLP node NODE. */
3770 vect_transform_slp_perm_load (vec_info
*vinfo
,
3771 slp_tree node
, vec
<tree
> dr_chain
,
3772 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
3773 bool analyze_only
, unsigned *n_perms
)
3775 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3777 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3778 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
3779 unsigned int mask_element
;
3782 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
3785 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
3787 mode
= TYPE_MODE (vectype
);
3788 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
3790 /* Initialize the vect stmts of NODE to properly insert the generated
3793 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
3794 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
3795 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
3797 /* Generate permutation masks for every NODE. Number of masks for each NODE
3798 is equal to GROUP_SIZE.
3799 E.g., we have a group of three nodes with three loads from the same
3800 location in each node, and the vector size is 4. I.e., we have a
3801 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3802 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3803 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3806 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3807 The last mask is illegal since we assume two operands for permute
3808 operation, and the mask element values can't be outside that range.
3809 Hence, the last mask must be converted into {2,5,5,5}.
3810 For the first two permutations we need the first and the second input
3811 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3812 we need the second and the third vectors: {b1,c1,a2,b2} and
3815 int vect_stmts_counter
= 0;
3816 unsigned int index
= 0;
3817 int first_vec_index
= -1;
3818 int second_vec_index
= -1;
3822 vec_perm_builder mask
;
3823 unsigned int nelts_to_build
;
3824 unsigned int nvectors_per_build
;
3825 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
3826 && multiple_p (nunits
, group_size
));
3829 /* A single vector contains a whole number of copies of the node, so:
3830 (a) all permutes can use the same mask; and
3831 (b) the permutes only need a single vector input. */
3832 mask
.new_vector (nunits
, group_size
, 3);
3833 nelts_to_build
= mask
.encoded_nelts ();
3834 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
3838 /* We need to construct a separate mask for each vector statement. */
3839 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
3840 if (!nunits
.is_constant (&const_nunits
)
3841 || !vf
.is_constant (&const_vf
))
3843 mask
.new_vector (const_nunits
, const_nunits
, 1);
3844 nelts_to_build
= const_vf
* group_size
;
3845 nvectors_per_build
= 1;
3848 unsigned int count
= mask
.encoded_nelts ();
3849 mask
.quick_grow (count
);
3850 vec_perm_indices indices
;
3852 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
3854 unsigned int iter_num
= j
/ group_size
;
3855 unsigned int stmt_num
= j
% group_size
;
3856 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
3857 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
3860 first_vec_index
= 0;
3865 /* Enforced before the loop when !repeating_p. */
3866 unsigned int const_nunits
= nunits
.to_constant ();
3867 vec_index
= i
/ const_nunits
;
3868 mask_element
= i
% const_nunits
;
3869 if (vec_index
== first_vec_index
3870 || first_vec_index
== -1)
3872 first_vec_index
= vec_index
;
3874 else if (vec_index
== second_vec_index
3875 || second_vec_index
== -1)
3877 second_vec_index
= vec_index
;
3878 mask_element
+= const_nunits
;
3882 if (dump_enabled_p ())
3883 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3884 "permutation requires at "
3885 "least three vectors %G",
3887 gcc_assert (analyze_only
);
3891 gcc_assert (mask_element
< 2 * const_nunits
);
3894 if (mask_element
!= index
)
3896 mask
[index
++] = mask_element
;
3898 if (index
== count
&& !noop_p
)
3900 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
3901 if (!can_vec_perm_const_p (mode
, indices
))
3903 if (dump_enabled_p ())
3905 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
3907 "unsupported vect permute { ");
3908 for (i
= 0; i
< count
; ++i
)
3910 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
3911 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
3913 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
3915 gcc_assert (analyze_only
);
3926 tree mask_vec
= NULL_TREE
;
3929 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
3931 if (second_vec_index
== -1)
3932 second_vec_index
= first_vec_index
;
3934 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
3936 /* Generate the permute statement if necessary. */
3937 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
3938 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
3942 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
3944 = vect_create_destination_var (gimple_assign_lhs (stmt
),
3946 perm_dest
= make_ssa_name (perm_dest
);
3948 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
3949 first_vec
, second_vec
,
3951 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
3955 /* If mask was NULL_TREE generate the requested
3956 identity transform. */
3957 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
3959 /* Store the vector statement in NODE. */
3960 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
3965 first_vec_index
= -1;
3966 second_vec_index
= -1;
3975 /* Vectorize the SLP permutations in NODE as specified
3976 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
3977 child number and lane number.
3978 Interleaving of two two-lane two-child SLP subtrees (not supported):
3979 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
3980 A blend of two four-lane two-child SLP subtrees:
3981 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
3982 Highpart of a four-lane one-child SLP subtree (not supported):
3983 [ { 0, 2 }, { 0, 3 } ]
3984 Where currently only a subset is supported by code generating below. */
3987 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
3988 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
3990 tree vectype
= SLP_TREE_VECTYPE (node
);
3992 /* ??? We currently only support all same vector input and output types
3993 while the SLP IL should really do a concat + select and thus accept
3994 arbitrary mismatches. */
3997 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3998 if (!types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
4000 if (dump_enabled_p ())
4001 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4002 "Unsupported lane permutation\n");
4006 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
4007 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
4008 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_NOTE
, vect_location
,
4011 "vectorizing permutation");
4012 for (unsigned i
= 0; i
< perm
.length (); ++i
)
4013 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
4014 dump_printf (MSG_NOTE
, "\n");
4017 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
4018 if (!nunits
.is_constant ())
4020 unsigned HOST_WIDE_INT vf
= 1;
4021 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4022 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
4024 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
4025 gcc_assert (multiple_p (olanes
, nunits
));
4027 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4028 from the { SLP operand, scalar lane } permutation as recorded in the
4029 SLP node as intermediate step. This part should already work
4030 with SLP children with arbitrary number of lanes. */
4031 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
4032 auto_vec
<unsigned> active_lane
;
4033 vperm
.create (olanes
);
4034 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length ());
4035 for (unsigned i
= 0; i
< vf
; ++i
)
4037 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
4039 std::pair
<unsigned, unsigned> p
= perm
[pi
];
4040 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
4041 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
4042 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
4043 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
4044 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
4046 /* Advance to the next group. */
4047 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
4048 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
4051 if (dump_enabled_p ())
4053 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
4054 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
4056 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
4057 dump_printf (MSG_NOTE
, ",");
4058 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
4059 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
4060 vperm
[i
].first
.second
);
4062 dump_printf (MSG_NOTE
, "\n");
4065 /* We can only handle two-vector permutes, everything else should
4066 be lowered on the SLP level. The following is closely inspired
4067 by vect_transform_slp_perm_load and is supposed to eventually
4069 ??? As intermediate step do code-gen in the SLP tree representation
4071 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
4072 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
4073 unsigned int const_nunits
= nunits
.to_constant ();
4074 unsigned int index
= 0;
4075 unsigned int mask_element
;
4076 vec_perm_builder mask
;
4077 mask
.new_vector (const_nunits
, const_nunits
, 1);
4078 unsigned int count
= mask
.encoded_nelts ();
4079 mask
.quick_grow (count
);
4080 vec_perm_indices indices
;
4081 unsigned nperms
= 0;
4082 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
4084 mask_element
= vperm
[i
].second
;
4085 if (first_vec
.first
== -1U
4086 || first_vec
== vperm
[i
].first
)
4087 first_vec
= vperm
[i
].first
;
4088 else if (second_vec
.first
== -1U
4089 || second_vec
== vperm
[i
].first
)
4091 second_vec
= vperm
[i
].first
;
4092 mask_element
+= const_nunits
;
4096 if (dump_enabled_p ())
4097 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4098 "permutation requires at "
4099 "least three vectors");
4104 mask
[index
++] = mask_element
;
4108 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
4110 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
4112 if (dump_enabled_p ())
4114 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
4116 "unsupported vect permute { ");
4117 for (i
= 0; i
< count
; ++i
)
4119 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
4120 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
4122 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
4131 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
4133 if (second_vec
.first
== -1U)
4134 second_vec
= first_vec
;
4136 /* Generate the permute statement if necessary. */
4137 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
4139 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
4140 slp_tree second_node
= SLP_TREE_CHILDREN (node
)[second_vec
.first
];
4142 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
4143 tree perm_dest
= make_ssa_name (vectype
);
4145 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
4146 first_def
, second_def
,
4148 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
4149 /* Store the vector statement in NODE. */
4150 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
4154 first_vec
= std::make_pair (-1U, -1U);
4155 second_vec
= std::make_pair (-1U, -1U);
4160 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
4165 /* Vectorize SLP instance tree in postorder. */
4168 vect_schedule_slp_instance (vec_info
*vinfo
,
4169 slp_tree node
, slp_instance instance
)
4171 gimple_stmt_iterator si
;
4175 /* See if we have already vectorized the node in the graph of the
4177 if ((SLP_TREE_DEF_TYPE (node
) == vect_internal_def
4178 && SLP_TREE_VEC_STMTS (node
).exists ())
4179 || SLP_TREE_VEC_DEFS (node
).exists ())
4182 /* Vectorize externals and constants. */
4183 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
4184 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
4186 /* ??? vectorizable_shift can end up using a scalar operand which is
4187 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4188 node in this case. */
4189 if (!SLP_TREE_VECTYPE (node
))
4192 vect_create_constant_vectors (vinfo
, node
);
4196 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4197 vect_schedule_slp_instance (vinfo
, child
, instance
);
4199 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
4200 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
4202 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
4203 if (dump_enabled_p ())
4204 dump_printf_loc (MSG_NOTE
, vect_location
,
4205 "------>vectorizing SLP node starting from: %G",
4208 /* Vectorized stmts go before the last scalar stmt which is where
4209 all uses are ready. */
4210 stmt_vec_info last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
4212 si
= gsi_for_stmt (last_stmt_info
->stmt
);
4215 /* Or if we do not have 1:1 matching scalar stmts emit after the
4216 children vectorized defs. */
4217 gimple
*last_stmt
= NULL
;
4218 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4219 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4221 /* We are emitting all vectorized stmts in the same place and
4222 the last one is the last.
4223 ??? Unless we have a load permutation applied and that
4224 figures to re-use an earlier generated load. */
4227 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
4229 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
4234 /* For externals we have to look at all defs since their
4235 insertion place is decided per vector. */
4238 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
4239 if (TREE_CODE (vdef
) == SSA_NAME
)
4241 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
4243 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
4247 if (is_a
<gphi
*> (last_stmt
))
4248 si
= gsi_after_labels (gimple_bb (last_stmt
));
4251 si
= gsi_for_stmt (last_stmt
);
4256 bool done_p
= false;
4258 /* Handle purely internal nodes. */
4259 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4261 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
4262 be shared with different SLP nodes (but usually it's the same
4263 operation apart from the case the stmt is only there for denoting
4264 the actual scalar lane defs ...). So do not call vect_transform_stmt
4265 but open-code it here (partly). */
4266 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
4271 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
4274 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4275 For loop vectorization this is done in vectorizable_call, but for SLP
4276 it needs to be deferred until end of vect_schedule_slp, because multiple
4277 SLP instances may refer to the same scalar stmt. */
4280 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
4281 slp_tree node
, hash_set
<slp_tree
> &visited
)
4284 gimple_stmt_iterator gsi
;
4288 stmt_vec_info stmt_info
;
4290 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
4293 if (visited
.add (node
))
4296 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4297 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
4299 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4301 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
4302 if (!stmt
|| gimple_bb (stmt
) == NULL
)
4304 if (is_pattern_stmt_p (stmt_info
)
4305 || !PURE_SLP_STMT (stmt_info
))
4307 lhs
= gimple_call_lhs (stmt
);
4308 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
4309 gsi
= gsi_for_stmt (stmt
);
4310 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
4311 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
4316 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
4318 hash_set
<slp_tree
> visited
;
4319 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
4322 /* Vectorize the instance root. */
4325 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
4327 gassign
*rstmt
= NULL
;
4329 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
4334 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
4336 tree vect_lhs
= gimple_get_lhs (child_stmt
);
4337 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
4338 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
4339 TREE_TYPE (vect_lhs
)))
4340 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
4342 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
4346 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
4348 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
4351 vec
<constructor_elt
, va_gc
> *v
;
4352 vec_alloc (v
, nelts
);
4354 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
4356 CONSTRUCTOR_APPEND_ELT (v
,
4358 gimple_get_lhs (child_stmt
));
4360 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
4361 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
4362 tree r_constructor
= build_constructor (rtype
, v
);
4363 rstmt
= gimple_build_assign (lhs
, r_constructor
);
4368 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
4369 gsi_replace (&rgsi
, rstmt
, true);
4372 /* Generate vector code for all SLP instances in the loop/basic block. */
4375 vect_schedule_slp (vec_info
*vinfo
)
4377 vec
<slp_instance
> slp_instances
;
4378 slp_instance instance
;
4381 slp_instances
= vinfo
->slp_instances
;
4382 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4384 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4385 /* Schedule the tree of INSTANCE. */
4386 vect_schedule_slp_instance (vinfo
, node
, instance
);
4388 if (SLP_INSTANCE_ROOT_STMT (instance
))
4389 vectorize_slp_instance_root_stmt (node
, instance
);
4391 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_NOTE
, vect_location
,
4393 "vectorizing stmts using SLP.\n");
4396 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4398 slp_tree root
= SLP_INSTANCE_TREE (instance
);
4399 stmt_vec_info store_info
;
4402 /* Remove scalar call stmts. Do not do this for basic-block
4403 vectorization as not all uses may be vectorized.
4404 ??? Why should this be necessary? DCE should be able to
4405 remove the stmts itself.
4406 ??? For BB vectorization we can as well remove scalar
4407 stmts starting from the SLP tree root if they have no
4409 if (is_a
<loop_vec_info
> (vinfo
))
4410 vect_remove_slp_scalar_calls (vinfo
, root
);
4412 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
4414 if (!STMT_VINFO_DATA_REF (store_info
))
4417 if (SLP_INSTANCE_ROOT_STMT (instance
))
4420 store_info
= vect_orig_stmt (store_info
);
4421 /* Free the attached stmt_vec_info and remove the stmt. */
4422 vinfo
->remove_stmt (store_info
);