1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "basic-block.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-pass.h"
36 #include "recog.h" /* FIXME: for insn_data */
38 #include "tree-vectorizer.h"
39 #include "langhooks.h"
41 /* Extract the location of the basic block in the source code.
42 Return the basic block location if succeed and NULL if not. */
45 find_bb_location (basic_block bb
)
48 gimple_stmt_iterator si
;
53 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
56 if (gimple_location (stmt
) != UNKNOWN_LOC
)
57 return gimple_location (stmt
);
64 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
67 vect_free_slp_tree (slp_tree node
)
75 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
76 vect_free_slp_tree (child
);
78 SLP_TREE_CHILDREN (node
).release ();
79 SLP_TREE_SCALAR_STMTS (node
).release ();
80 SLP_TREE_VEC_STMTS (node
).release ();
86 /* Free the memory allocated for the SLP instance. */
89 vect_free_slp_instance (slp_instance instance
)
91 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
92 SLP_INSTANCE_LOAD_PERMUTATION (instance
).release ();
93 SLP_INSTANCE_LOADS (instance
).release ();
94 SLP_INSTANCE_BODY_COST_VEC (instance
).release ();
99 /* Create an SLP node for SCALAR_STMTS. */
102 vect_create_new_slp_node (vec
<gimple
> scalar_stmts
)
105 gimple stmt
= scalar_stmts
[0];
108 if (is_gimple_call (stmt
))
109 nops
= gimple_call_num_args (stmt
);
110 else if (is_gimple_assign (stmt
))
112 nops
= gimple_num_ops (stmt
) - 1;
113 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
119 node
= XNEW (struct _slp_tree
);
120 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
121 SLP_TREE_VEC_STMTS (node
).create (0);
122 SLP_TREE_CHILDREN (node
).create (nops
);
128 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
130 static vec
<slp_oprnd_info
>
131 vect_create_oprnd_info (int nops
, int group_size
)
134 slp_oprnd_info oprnd_info
;
135 vec
<slp_oprnd_info
> oprnds_info
;
137 oprnds_info
.create (nops
);
138 for (i
= 0; i
< nops
; i
++)
140 oprnd_info
= XNEW (struct _slp_oprnd_info
);
141 oprnd_info
->def_stmts
.create (group_size
);
142 oprnd_info
->first_dt
= vect_uninitialized_def
;
143 oprnd_info
->first_def_type
= NULL_TREE
;
144 oprnd_info
->first_const_oprnd
= NULL_TREE
;
145 oprnd_info
->first_pattern
= false;
146 oprnds_info
.quick_push (oprnd_info
);
153 /* Free operands info. */
156 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
159 slp_oprnd_info oprnd_info
;
161 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
163 oprnd_info
->def_stmts
.release ();
164 XDELETE (oprnd_info
);
167 oprnds_info
.release ();
171 /* Find the place of the data-ref in STMT in the interleaving chain that starts
172 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
175 vect_get_place_in_interleaving_chain (gimple stmt
, gimple first_stmt
)
177 gimple next_stmt
= first_stmt
;
180 if (first_stmt
!= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
185 if (next_stmt
== stmt
)
188 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
196 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
197 they are of a valid type and that they match the defs of the first stmt of
198 the SLP group (stored in OPRNDS_INFO). */
201 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
202 gimple stmt
, bool first
,
203 vec
<slp_oprnd_info
> *oprnds_info
)
206 unsigned int i
, number_of_oprnds
;
209 enum vect_def_type dt
= vect_uninitialized_def
;
210 struct loop
*loop
= NULL
;
211 bool pattern
= false;
212 slp_oprnd_info oprnd_info
;
214 tree compare_rhs
= NULL_TREE
;
217 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
219 if (is_gimple_call (stmt
))
221 number_of_oprnds
= gimple_call_num_args (stmt
);
224 else if (is_gimple_assign (stmt
))
226 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
227 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
233 for (i
= 0; i
< number_of_oprnds
; i
++)
238 compare_rhs
= NULL_TREE
;
241 oprnd
= gimple_op (stmt
, op_idx
++);
243 oprnd_info
= (*oprnds_info
)[i
];
245 if (COMPARISON_CLASS_P (oprnd
))
247 compare_rhs
= TREE_OPERAND (oprnd
, 1);
248 oprnd
= TREE_OPERAND (oprnd
, 0);
251 if (!vect_is_simple_use (oprnd
, NULL
, loop_vinfo
, bb_vinfo
, &def_stmt
,
253 || (!def_stmt
&& dt
!= vect_constant_def
))
255 if (dump_enabled_p ())
257 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
258 "Build SLP failed: can't find def for ");
259 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, oprnd
);
265 /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
266 from the pattern. Check that all the stmts of the node are in the
268 if (def_stmt
&& gimple_bb (def_stmt
)
269 && ((loop
&& flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
)))
270 || (!loop
&& gimple_bb (def_stmt
) == BB_VINFO_BB (bb_vinfo
)
271 && gimple_code (def_stmt
) != GIMPLE_PHI
))
272 && vinfo_for_stmt (def_stmt
)
273 && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt
))
274 && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt
))
275 && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt
)))
278 if (!first
&& !oprnd_info
->first_pattern
)
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
283 "Build SLP failed: some of the stmts"
284 " are in a pattern, and others are not ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, oprnd
);
291 def_stmt
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt
));
292 dt
= STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt
));
294 if (dt
== vect_unknown_def_type
)
296 if (dump_enabled_p ())
297 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
298 "Unsupported pattern.");
302 switch (gimple_code (def_stmt
))
305 def
= gimple_phi_result (def_stmt
);
309 def
= gimple_assign_lhs (def_stmt
);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
315 "unsupported defining stmt: ");
322 oprnd_info
->first_dt
= dt
;
323 oprnd_info
->first_pattern
= pattern
;
326 oprnd_info
->first_def_type
= TREE_TYPE (def
);
327 oprnd_info
->first_const_oprnd
= NULL_TREE
;
331 oprnd_info
->first_def_type
= NULL_TREE
;
332 oprnd_info
->first_const_oprnd
= oprnd
;
337 /* Not first stmt of the group, check that the def-stmt/s match
338 the def-stmt/s of the first stmt. Allow different definition
339 types for reduction chains: the first stmt must be a
340 vect_reduction_def (a phi node), and the rest
341 vect_internal_def. */
342 if (((oprnd_info
->first_dt
!= dt
343 && !(oprnd_info
->first_dt
== vect_reduction_def
344 && dt
== vect_internal_def
))
345 || (oprnd_info
->first_def_type
!= NULL_TREE
347 && !types_compatible_p (oprnd_info
->first_def_type
,
350 && !types_compatible_p (TREE_TYPE (oprnd_info
->first_const_oprnd
),
353 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
355 "Build SLP failed: different types ");
361 /* Check the types of the definitions. */
364 case vect_constant_def
:
365 case vect_external_def
:
366 case vect_reduction_def
:
369 case vect_internal_def
:
370 oprnd_info
->def_stmts
.quick_push (def_stmt
);
374 /* FORNOW: Not supported. */
375 if (dump_enabled_p ())
377 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
378 "Build SLP failed: illegal type of def ");
379 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, def
);
390 /* Recursively build an SLP tree starting from NODE.
391 Fail (and return FALSE) if def-stmts are not isomorphic, require data
392 permutation or are of unsupported types of operation. Otherwise, return
396 vect_build_slp_tree (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
397 slp_tree
*node
, unsigned int group_size
,
398 unsigned int *max_nunits
,
399 vec
<slp_tree
> *loads
,
400 unsigned int vectorization_factor
)
403 vec
<gimple
> stmts
= SLP_TREE_SCALAR_STMTS (*node
);
404 gimple stmt
= stmts
[0];
405 enum tree_code first_stmt_code
= ERROR_MARK
, rhs_code
= ERROR_MARK
;
406 enum tree_code first_cond_code
= ERROR_MARK
;
408 bool stop_recursion
= false, need_same_oprnds
= false;
409 tree vectype
, scalar_type
, first_op1
= NULL_TREE
;
412 enum machine_mode optab_op2_mode
;
413 enum machine_mode vec_mode
;
414 struct data_reference
*first_dr
;
416 gimple first_load
= NULL
, prev_first_load
= NULL
, old_first_load
= NULL
;
417 vec
<slp_oprnd_info
> oprnds_info
;
419 slp_oprnd_info oprnd_info
;
422 if (is_gimple_call (stmt
))
423 nops
= gimple_call_num_args (stmt
);
424 else if (is_gimple_assign (stmt
))
426 nops
= gimple_num_ops (stmt
) - 1;
427 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
433 oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
435 /* For every stmt in NODE find its def stmt/s. */
436 FOR_EACH_VEC_ELT (stmts
, i
, stmt
)
438 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for ");
441 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
444 /* Fail to vectorize statements marked as unvectorizable. */
445 if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt
)))
447 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
450 "Build SLP failed: unvectorizable statement ");
451 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
454 vect_free_oprnd_info (oprnds_info
);
458 lhs
= gimple_get_lhs (stmt
);
459 if (lhs
== NULL_TREE
)
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
464 "Build SLP failed: not GIMPLE_ASSIGN nor "
466 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
469 vect_free_oprnd_info (oprnds_info
);
473 if (is_gimple_assign (stmt
)
474 && gimple_assign_rhs_code (stmt
) == COND_EXPR
475 && (cond
= gimple_assign_rhs1 (stmt
))
476 && !COMPARISON_CLASS_P (cond
))
478 if (dump_enabled_p ())
480 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
481 "Build SLP failed: condition is not "
483 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
486 vect_free_oprnd_info (oprnds_info
);
490 scalar_type
= vect_get_smallest_scalar_type (stmt
, &dummy
, &dummy
);
491 vectype
= get_vectype_for_scalar_type (scalar_type
);
494 if (dump_enabled_p ())
496 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
497 "Build SLP failed: unsupported data-type ");
498 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
502 vect_free_oprnd_info (oprnds_info
);
506 /* In case of multiple types we need to detect the smallest type. */
507 if (*max_nunits
< TYPE_VECTOR_SUBPARTS (vectype
))
509 *max_nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
511 vectorization_factor
= *max_nunits
;
514 if (is_gimple_call (stmt
))
516 rhs_code
= CALL_EXPR
;
517 if (gimple_call_internal_p (stmt
)
518 || gimple_call_tail_p (stmt
)
519 || gimple_call_noreturn_p (stmt
)
520 || !gimple_call_nothrow_p (stmt
)
521 || gimple_call_chain (stmt
))
523 if (dump_enabled_p ())
525 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
526 "Build SLP failed: unsupported call type ");
527 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
530 vect_free_oprnd_info (oprnds_info
);
535 rhs_code
= gimple_assign_rhs_code (stmt
);
537 /* Check the operation. */
540 first_stmt_code
= rhs_code
;
542 /* Shift arguments should be equal in all the packed stmts for a
543 vector shift with scalar shift operand. */
544 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
545 || rhs_code
== LROTATE_EXPR
546 || rhs_code
== RROTATE_EXPR
)
548 vec_mode
= TYPE_MODE (vectype
);
550 /* First see if we have a vector/vector shift. */
551 optab
= optab_for_tree_code (rhs_code
, vectype
,
555 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
557 /* No vector/vector shift, try for a vector/scalar shift. */
558 optab
= optab_for_tree_code (rhs_code
, vectype
,
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
565 "Build SLP failed: no optab.");
566 vect_free_oprnd_info (oprnds_info
);
569 icode
= (int) optab_handler (optab
, vec_mode
);
570 if (icode
== CODE_FOR_nothing
)
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
575 "op not supported by target.");
576 vect_free_oprnd_info (oprnds_info
);
579 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
580 if (!VECTOR_MODE_P (optab_op2_mode
))
582 need_same_oprnds
= true;
583 first_op1
= gimple_assign_rhs2 (stmt
);
587 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
589 need_same_oprnds
= true;
590 first_op1
= gimple_assign_rhs2 (stmt
);
595 if (first_stmt_code
!= rhs_code
596 && (first_stmt_code
!= IMAGPART_EXPR
597 || rhs_code
!= REALPART_EXPR
)
598 && (first_stmt_code
!= REALPART_EXPR
599 || rhs_code
!= IMAGPART_EXPR
)
600 && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
))
601 && (first_stmt_code
== ARRAY_REF
602 || first_stmt_code
== BIT_FIELD_REF
603 || first_stmt_code
== INDIRECT_REF
604 || first_stmt_code
== COMPONENT_REF
605 || first_stmt_code
== MEM_REF
)))
607 if (dump_enabled_p ())
609 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
610 "Build SLP failed: different operation "
612 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
615 vect_free_oprnd_info (oprnds_info
);
620 && !operand_equal_p (first_op1
, gimple_assign_rhs2 (stmt
), 0))
622 if (dump_enabled_p ())
624 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
625 "Build SLP failed: different shift "
627 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
630 vect_free_oprnd_info (oprnds_info
);
634 if (rhs_code
== CALL_EXPR
)
636 gimple first_stmt
= stmts
[0];
637 if (gimple_call_num_args (stmt
) != nops
638 || !operand_equal_p (gimple_call_fn (first_stmt
),
639 gimple_call_fn (stmt
), 0)
640 || gimple_call_fntype (first_stmt
)
641 != gimple_call_fntype (stmt
))
643 if (dump_enabled_p ())
645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
646 "Build SLP failed: different calls in ");
647 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
651 vect_free_oprnd_info (oprnds_info
);
657 /* Grouped store or load. */
658 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt
)))
660 if (REFERENCE_CLASS_P (lhs
))
663 if (!vect_get_and_check_slp_defs (loop_vinfo
, bb_vinfo
,
664 stmt
, (i
== 0), &oprnds_info
))
666 vect_free_oprnd_info (oprnds_info
);
673 unsigned unrolling_factor
674 = least_common_multiple
675 (*max_nunits
, group_size
) / group_size
;
676 /* FORNOW: Check that there is no gap between the loads
677 and no gap between the groups when we need to load
678 multiple groups at once.
679 ??? We should enhance this to only disallow gaps
681 if ((unrolling_factor
> 1
682 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
683 && GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
684 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) != stmt
685 && GROUP_GAP (vinfo_for_stmt (stmt
)) != 1))
687 if (dump_enabled_p ())
689 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
690 "Build SLP failed: grouped "
692 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
696 vect_free_oprnd_info (oprnds_info
);
700 /* Check that the size of interleaved loads group is not
701 greater than the SLP group size. */
703 = vectorization_factor
/ TYPE_VECTOR_SUBPARTS (vectype
);
705 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
706 && ((GROUP_SIZE (vinfo_for_stmt (stmt
))
707 - GROUP_GAP (vinfo_for_stmt (stmt
)))
708 > ncopies
* group_size
))
710 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
713 "Build SLP failed: the number "
714 "of interleaved loads is greater than "
715 "the SLP group size ");
716 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
720 vect_free_oprnd_info (oprnds_info
);
724 old_first_load
= first_load
;
725 first_load
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
728 /* Check that there are no loads from different interleaving
729 chains in the same node. The only exception is complex
731 if (prev_first_load
!= first_load
732 && rhs_code
!= REALPART_EXPR
733 && rhs_code
!= IMAGPART_EXPR
)
735 if (dump_enabled_p ())
737 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
739 "Build SLP failed: different "
740 "interleaving chains in one node ");
741 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
745 vect_free_oprnd_info (oprnds_info
);
750 prev_first_load
= first_load
;
752 /* In some cases a group of loads is just the same load
753 repeated N times. Only analyze its cost once. */
754 if (first_load
== stmt
&& old_first_load
!= first_load
)
756 first_dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
757 if (vect_supportable_dr_alignment (first_dr
, false)
758 == dr_unaligned_unsupported
)
760 if (dump_enabled_p ())
762 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
764 "Build SLP failed: unsupported "
766 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
770 vect_free_oprnd_info (oprnds_info
);
775 /* We stop the tree when we reach a group of loads. */
776 stop_recursion
= true;
779 } /* Grouped access. */
782 if (TREE_CODE_CLASS (rhs_code
) == tcc_reference
)
784 /* Not grouped load. */
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
788 "Build SLP failed: not grouped load ");
789 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
792 /* FORNOW: Not grouped loads are not supported. */
793 vect_free_oprnd_info (oprnds_info
);
797 /* Not memory operation. */
798 if (TREE_CODE_CLASS (rhs_code
) != tcc_binary
799 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
800 && rhs_code
!= COND_EXPR
801 && rhs_code
!= CALL_EXPR
)
803 if (dump_enabled_p ())
805 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
806 "Build SLP failed: operation");
807 dump_printf (MSG_MISSED_OPTIMIZATION
, " unsupported ");
808 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
811 vect_free_oprnd_info (oprnds_info
);
815 if (rhs_code
== COND_EXPR
)
817 tree cond_expr
= gimple_assign_rhs1 (stmt
);
820 first_cond_code
= TREE_CODE (cond_expr
);
821 else if (first_cond_code
!= TREE_CODE (cond_expr
))
823 if (dump_enabled_p ())
825 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
826 "Build SLP failed: different"
828 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
832 vect_free_oprnd_info (oprnds_info
);
837 /* Find the def-stmts. */
838 if (!vect_get_and_check_slp_defs (loop_vinfo
, bb_vinfo
, stmt
,
839 (i
== 0), &oprnds_info
))
841 vect_free_oprnd_info (oprnds_info
);
847 /* Grouped loads were reached - stop the recursion. */
850 loads
->safe_push (*node
);
851 vect_free_oprnd_info (oprnds_info
);
855 /* Create SLP_TREE nodes for the definition node/s. */
856 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
860 if (oprnd_info
->first_dt
!= vect_internal_def
)
863 child
= vect_create_new_slp_node (oprnd_info
->def_stmts
);
865 || !vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &child
, group_size
,
867 vectorization_factor
))
870 oprnd_info
->def_stmts
= vNULL
;
871 vect_free_slp_tree (child
);
872 vect_free_oprnd_info (oprnds_info
);
876 oprnd_info
->def_stmts
.create (0);
877 SLP_TREE_CHILDREN (*node
).quick_push (child
);
880 vect_free_oprnd_info (oprnds_info
);
884 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
887 vect_print_slp_tree (int dump_kind
, slp_tree node
)
896 dump_printf (dump_kind
, "node ");
897 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
899 dump_printf (dump_kind
, "\n\tstmt %d ", i
);
900 dump_gimple_stmt (dump_kind
, TDF_SLIM
, stmt
, 0);
902 dump_printf (dump_kind
, "\n");
904 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
905 vect_print_slp_tree (dump_kind
, child
);
909 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
910 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
911 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
912 stmts in NODE are to be marked. */
915 vect_mark_slp_stmts (slp_tree node
, enum slp_vect_type mark
, int j
)
924 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
926 STMT_SLP_TYPE (vinfo_for_stmt (stmt
)) = mark
;
928 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
929 vect_mark_slp_stmts (child
, mark
, j
);
933 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
936 vect_mark_slp_stmts_relevant (slp_tree node
)
940 stmt_vec_info stmt_info
;
946 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
948 stmt_info
= vinfo_for_stmt (stmt
);
949 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
950 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
951 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
954 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
955 vect_mark_slp_stmts_relevant (child
);
959 /* Check if the permutation required by the SLP INSTANCE is supported.
960 Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. */
963 vect_supported_slp_permutation_p (slp_instance instance
)
965 slp_tree node
= SLP_INSTANCE_LOADS (instance
)[0];
966 gimple stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
967 gimple first_load
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
968 vec
<slp_tree
> sorted_loads
= vNULL
;
970 slp_tree
*tmp_loads
= NULL
;
971 int group_size
= SLP_INSTANCE_GROUP_SIZE (instance
), i
, j
;
974 /* FORNOW: The only supported loads permutation is loads from the same
975 location in all the loads in the node, when the data-refs in
976 nodes of LOADS constitute an interleaving chain.
977 Sort the nodes according to the order of accesses in the chain. */
978 tmp_loads
= (slp_tree
*) xmalloc (sizeof (slp_tree
) * group_size
);
980 SLP_INSTANCE_LOAD_PERMUTATION (instance
).iterate (i
, &index
)
981 && SLP_INSTANCE_LOADS (instance
).iterate (j
, &load
);
982 i
+= group_size
, j
++)
984 gimple scalar_stmt
= SLP_TREE_SCALAR_STMTS (load
)[0];
985 /* Check that the loads are all in the same interleaving chain. */
986 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt
)) != first_load
)
988 if (dump_enabled_p ())
990 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
991 "Build SLP failed: unsupported data "
993 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
1001 tmp_loads
[index
] = load
;
1004 sorted_loads
.create (group_size
);
1005 for (i
= 0; i
< group_size
; i
++)
1006 sorted_loads
.safe_push (tmp_loads
[i
]);
1008 SLP_INSTANCE_LOADS (instance
).release ();
1009 SLP_INSTANCE_LOADS (instance
) = sorted_loads
;
1012 if (!vect_transform_slp_perm_load (stmt
, vNULL
, NULL
,
1013 SLP_INSTANCE_UNROLLING_FACTOR (instance
),
1021 /* Rearrange the statements of NODE according to PERMUTATION. */
1024 vect_slp_rearrange_stmts (slp_tree node
, unsigned int group_size
,
1025 vec
<int> permutation
)
1028 vec
<gimple
> tmp_stmts
;
1032 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1033 vect_slp_rearrange_stmts (child
, group_size
, permutation
);
1035 gcc_assert (group_size
== SLP_TREE_SCALAR_STMTS (node
).length ());
1036 tmp_stmts
.create (group_size
);
1037 tmp_stmts
.quick_grow_cleared (group_size
);
1039 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1040 tmp_stmts
[permutation
[i
]] = stmt
;
1042 SLP_TREE_SCALAR_STMTS (node
).release ();
1043 SLP_TREE_SCALAR_STMTS (node
) = tmp_stmts
;
1047 /* Check if the required load permutation is supported.
1048 LOAD_PERMUTATION contains a list of indices of the loads.
1049 In SLP this permutation is relative to the order of grouped stores that are
1050 the base of the SLP instance. */
1053 vect_supported_load_permutation_p (slp_instance slp_instn
, int group_size
,
1054 vec
<int> load_permutation
)
1056 int i
= 0, j
, prev
= -1, next
, k
, number_of_groups
;
1057 bool supported
, bad_permutation
= false;
1059 slp_tree node
, other_complex_node
;
1060 gimple stmt
, first
= NULL
, other_node_first
, load
, next_load
, first_load
;
1061 unsigned complex_numbers
= 0;
1062 struct data_reference
*dr
;
1063 bb_vec_info bb_vinfo
;
1065 /* FORNOW: permutations are only supported in SLP. */
1069 if (dump_enabled_p ())
1071 dump_printf_loc (MSG_NOTE
, vect_location
, "Load permutation ");
1072 FOR_EACH_VEC_ELT (load_permutation
, i
, next
)
1073 dump_printf (MSG_NOTE
, "%d ", next
);
1076 /* In case of reduction every load permutation is allowed, since the order
1077 of the reduction statements is not important (as opposed to the case of
1078 grouped stores). The only condition we need to check is that all the
1079 load nodes are of the same size and have the same permutation (and then
1080 rearrange all the nodes of the SLP instance according to this
1083 /* Check that all the load nodes are of the same size. */
1084 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1086 if (SLP_TREE_SCALAR_STMTS (node
).length () != (unsigned) group_size
)
1089 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
1090 if (is_gimple_assign (stmt
)
1091 && (gimple_assign_rhs_code (stmt
) == REALPART_EXPR
1092 || gimple_assign_rhs_code (stmt
) == IMAGPART_EXPR
))
1096 /* Complex operands can be swapped as following:
1097 real_c = real_b + real_a;
1098 imag_c = imag_a + imag_b;
1099 i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of
1100 {real_a, imag_a} and {real_b, imag_b}. We check here that if interleaving
1101 chains are mixed, they match the above pattern. */
1102 if (complex_numbers
)
1104 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1106 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, stmt
)
1112 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) != first
)
1114 if (complex_numbers
!= 2)
1122 other_complex_node
= SLP_INSTANCE_LOADS (slp_instn
)[k
];
1124 SLP_TREE_SCALAR_STMTS (other_complex_node
)[0];
1126 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
))
1127 != other_node_first
)
1135 /* We checked that this case ok, so there is no need to proceed with
1136 permutation tests. */
1137 if (complex_numbers
== 2
1138 && SLP_INSTANCE_LOADS (slp_instn
).length () == 2)
1140 SLP_INSTANCE_LOADS (slp_instn
).release ();
1141 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn
).release ();
1145 node
= SLP_INSTANCE_TREE (slp_instn
);
1146 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
1147 /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
1148 instance, not all the loads belong to the same node or interleaving
1149 group. Hence, we need to divide them into groups according to
1151 number_of_groups
= load_permutation
.length () / group_size
;
1153 /* Reduction (there are no data-refs in the root).
1154 In reduction chain the order of the loads is important. */
1155 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
))
1156 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1158 int first_group_load_index
;
1160 /* Compare all the permutation sequences to the first one. */
1161 for (i
= 1; i
< number_of_groups
; i
++)
1164 for (j
= i
* group_size
; j
< i
* group_size
+ group_size
; j
++)
1166 next
= load_permutation
[j
];
1167 first_group_load_index
= load_permutation
[k
];
1169 if (next
!= first_group_load_index
)
1171 bad_permutation
= true;
1178 if (bad_permutation
)
1182 if (!bad_permutation
)
1184 /* Check that the loads in the first sequence are different and there
1185 are no gaps between them. */
1186 load_index
= sbitmap_alloc (group_size
);
1187 bitmap_clear (load_index
);
1188 for (k
= 0; k
< group_size
; k
++)
1190 first_group_load_index
= load_permutation
[k
];
1191 if (bitmap_bit_p (load_index
, first_group_load_index
))
1193 bad_permutation
= true;
1197 bitmap_set_bit (load_index
, first_group_load_index
);
1200 if (!bad_permutation
)
1201 for (k
= 0; k
< group_size
; k
++)
1202 if (!bitmap_bit_p (load_index
, k
))
1204 bad_permutation
= true;
1208 sbitmap_free (load_index
);
1211 if (!bad_permutation
)
1213 /* This permutation is valid for reduction. Since the order of the
1214 statements in the nodes is not important unless they are memory
1215 accesses, we can rearrange the statements in all the nodes
1216 according to the order of the loads. */
1217 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn
), group_size
,
1219 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn
).release ();
1224 /* In basic block vectorization we allow any subchain of an interleaving
1226 FORNOW: not supported in loop SLP because of realignment compications. */
1227 bb_vinfo
= STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt
));
1228 bad_permutation
= false;
1229 /* Check that for every node in the instance the loads form a subchain. */
1232 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1236 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load
)
1239 first_load
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (load
));
1241 != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load
)))
1243 bad_permutation
= true;
1247 if (j
!= 0 && next_load
!= load
)
1249 bad_permutation
= true;
1253 next_load
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (load
));
1256 if (bad_permutation
)
1260 /* Check that the alignment of the first load in every subchain, i.e.,
1261 the first statement in every load node, is supported. */
1262 if (!bad_permutation
)
1264 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn
), i
, node
)
1266 first_load
= SLP_TREE_SCALAR_STMTS (node
)[0];
1268 != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load
)))
1270 dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load
));
1271 if (vect_supportable_dr_alignment (dr
, false)
1272 == dr_unaligned_unsupported
)
1274 if (dump_enabled_p ())
1276 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1278 "unsupported unaligned load ");
1279 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
1282 bad_permutation
= true;
1288 if (!bad_permutation
)
1290 SLP_INSTANCE_LOAD_PERMUTATION (slp_instn
).release ();
1296 /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
1297 GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
1298 well (unless it's reduction). */
1299 if (load_permutation
.length ()
1300 != (unsigned int) (group_size
* group_size
))
1304 load_index
= sbitmap_alloc (group_size
);
1305 bitmap_clear (load_index
);
1306 for (j
= 0; j
< group_size
; j
++)
1308 for (i
= j
* group_size
, k
= 0;
1309 load_permutation
.iterate (i
, &next
) && k
< group_size
;
1312 if (i
!= j
* group_size
&& next
!= prev
)
1321 if (bitmap_bit_p (load_index
, prev
))
1327 bitmap_set_bit (load_index
, prev
);
1330 for (j
= 0; j
< group_size
; j
++)
1331 if (!bitmap_bit_p (load_index
, j
))
1333 sbitmap_free (load_index
);
1337 sbitmap_free (load_index
);
1339 if (supported
&& i
== group_size
* group_size
1340 && vect_supported_slp_permutation_p (slp_instn
))
1347 /* Find the first load in the loop that belongs to INSTANCE.
1348 When loads are in several SLP nodes, there can be a case in which the first
1349 load does not appear in the first SLP node to be transformed, causing
1350 incorrect order of statements. Since we generate all the loads together,
1351 they must be inserted before the first load of the SLP instance and not
1352 before the first load of the first node of the instance. */
1355 vect_find_first_load_in_slp_instance (slp_instance instance
)
1359 gimple first_load
= NULL
, load
;
1361 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load_node
)
1362 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load
)
1363 first_load
= get_earlier_stmt (load
, first_load
);
1369 /* Find the last store in SLP INSTANCE. */
1372 vect_find_last_store_in_slp_instance (slp_instance instance
)
1376 gimple last_store
= NULL
, store
;
1378 node
= SLP_INSTANCE_TREE (instance
);
1379 for (i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &store
); i
++)
1380 last_store
= get_later_stmt (store
, last_store
);
1385 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE. */
1388 vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1389 slp_instance instance
, slp_tree node
,
1390 stmt_vector_for_cost
*prologue_cost_vec
,
1391 unsigned ncopies_for_cost
)
1393 stmt_vector_for_cost
*body_cost_vec
= &SLP_INSTANCE_BODY_COST_VEC (instance
);
1398 stmt_vec_info stmt_info
;
1400 unsigned group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
1402 /* Recurse down the SLP tree. */
1403 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1404 vect_analyze_slp_cost_1 (loop_vinfo
, bb_vinfo
,
1405 instance
, child
, prologue_cost_vec
,
1408 /* Look at the first scalar stmt to determine the cost. */
1409 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
1410 stmt_info
= vinfo_for_stmt (stmt
);
1411 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1413 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info
)))
1414 vect_model_store_cost (stmt_info
, ncopies_for_cost
, false,
1415 vect_uninitialized_def
,
1416 node
, prologue_cost_vec
, body_cost_vec
);
1420 gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)));
1421 vect_model_load_cost (stmt_info
, ncopies_for_cost
, false,
1422 node
, prologue_cost_vec
, body_cost_vec
);
1423 /* If the load is permuted record the cost for the permutation.
1424 ??? Loads from multiple chains are let through here only
1425 for a single special case involving complex numbers where
1426 in the end no permutation is necessary. */
1427 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, s
)
1428 if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s
))
1429 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
))
1430 && vect_get_place_in_interleaving_chain
1431 (s
, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
)) != i
)
1433 record_stmt_cost (body_cost_vec
, group_size
, vec_perm
,
1434 stmt_info
, 0, vect_body
);
1440 record_stmt_cost (body_cost_vec
, ncopies_for_cost
, vector_stmt
,
1441 stmt_info
, 0, vect_body
);
1443 /* Scan operands and account for prologue cost of constants/externals.
1444 ??? This over-estimates cost for multiple uses and should be
1446 lhs
= gimple_get_lhs (stmt
);
1447 for (i
= 0; i
< gimple_num_ops (stmt
); ++i
)
1449 tree def
, op
= gimple_op (stmt
, i
);
1451 enum vect_def_type dt
;
1452 if (!op
|| op
== lhs
)
1454 if (vect_is_simple_use (op
, NULL
, loop_vinfo
, bb_vinfo
,
1455 &def_stmt
, &def
, &dt
)
1456 && (dt
== vect_constant_def
|| dt
== vect_external_def
))
1457 record_stmt_cost (prologue_cost_vec
, 1, vector_stmt
,
1458 stmt_info
, 0, vect_prologue
);
1462 /* Compute the cost for the SLP instance INSTANCE. */
1465 vect_analyze_slp_cost (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1466 slp_instance instance
, unsigned nunits
)
1468 stmt_vector_for_cost body_cost_vec
, prologue_cost_vec
;
1469 unsigned ncopies_for_cost
;
1470 stmt_info_for_cost
*si
;
1473 /* Calculate the number of vector stmts to create based on the unrolling
1474 factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1475 GROUP_SIZE / NUNITS otherwise. */
1476 unsigned group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
1477 ncopies_for_cost
= least_common_multiple (nunits
, group_size
) / nunits
;
1479 prologue_cost_vec
.create (10);
1480 body_cost_vec
.create (10);
1481 SLP_INSTANCE_BODY_COST_VEC (instance
) = body_cost_vec
;
1482 vect_analyze_slp_cost_1 (loop_vinfo
, bb_vinfo
,
1483 instance
, SLP_INSTANCE_TREE (instance
),
1484 &prologue_cost_vec
, ncopies_for_cost
);
1486 /* Record the prologue costs, which were delayed until we were
1487 sure that SLP was successful. Unlike the body costs, we know
1488 the final values now regardless of the loop vectorization factor. */
1489 void *data
= (loop_vinfo
? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
)
1490 : BB_VINFO_TARGET_COST_DATA (bb_vinfo
));
1491 FOR_EACH_VEC_ELT (prologue_cost_vec
, i
, si
)
1493 struct _stmt_vec_info
*stmt_info
1494 = si
->stmt
? vinfo_for_stmt (si
->stmt
) : NULL
;
1495 (void) add_stmt_cost (data
, si
->count
, si
->kind
, stmt_info
,
1496 si
->misalign
, vect_prologue
);
1499 prologue_cost_vec
.release ();
1502 /* Analyze an SLP instance starting from a group of grouped stores. Call
1503 vect_build_slp_tree to build a tree of packed stmts if possible.
1504 Return FALSE if it's impossible to SLP any stmt in the loop. */
1507 vect_analyze_slp_instance (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
,
1510 slp_instance new_instance
;
1512 unsigned int group_size
= GROUP_SIZE (vinfo_for_stmt (stmt
));
1513 unsigned int unrolling_factor
= 1, nunits
;
1514 tree vectype
, scalar_type
= NULL_TREE
;
1516 unsigned int vectorization_factor
= 0;
1518 unsigned int max_nunits
= 0;
1519 vec
<slp_tree
> loads
;
1520 struct data_reference
*dr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt
));
1521 vec
<gimple
> scalar_stmts
;
1523 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1527 scalar_type
= TREE_TYPE (DR_REF (dr
));
1528 vectype
= get_vectype_for_scalar_type (scalar_type
);
1532 gcc_assert (loop_vinfo
);
1533 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1536 group_size
= GROUP_SIZE (vinfo_for_stmt (stmt
));
1540 gcc_assert (loop_vinfo
);
1541 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1542 group_size
= LOOP_VINFO_REDUCTIONS (loop_vinfo
).length ();
1547 if (dump_enabled_p ())
1549 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1550 "Build SLP failed: unsupported data-type ");
1551 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, scalar_type
);
1557 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
1559 vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1561 vectorization_factor
= nunits
;
1563 /* Calculate the unrolling factor. */
1564 unrolling_factor
= least_common_multiple (nunits
, group_size
) / group_size
;
1565 if (unrolling_factor
!= 1 && !loop_vinfo
)
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1569 "Build SLP failed: unrolling required in basic"
1575 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1576 scalar_stmts
.create (group_size
);
1578 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
1580 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1583 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next
))
1584 && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next
)))
1585 scalar_stmts
.safe_push (
1586 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next
)));
1588 scalar_stmts
.safe_push (next
);
1589 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
1594 /* Collect reduction statements. */
1595 vec
<gimple
> reductions
= LOOP_VINFO_REDUCTIONS (loop_vinfo
);
1596 for (i
= 0; reductions
.iterate (i
, &next
); i
++)
1597 scalar_stmts
.safe_push (next
);
1600 node
= vect_create_new_slp_node (scalar_stmts
);
1602 loads
.create (group_size
);
1604 /* Build the tree for the SLP instance. */
1605 if (vect_build_slp_tree (loop_vinfo
, bb_vinfo
, &node
, group_size
,
1606 &max_nunits
, &loads
,
1607 vectorization_factor
))
1609 /* Calculate the unrolling factor based on the smallest type. */
1610 if (max_nunits
> nunits
)
1611 unrolling_factor
= least_common_multiple (max_nunits
, group_size
)
1614 if (unrolling_factor
!= 1 && !loop_vinfo
)
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1618 "Build SLP failed: unrolling required in basic"
1620 vect_free_slp_tree (node
);
1625 /* Create a new SLP instance. */
1626 new_instance
= XNEW (struct _slp_instance
);
1627 SLP_INSTANCE_TREE (new_instance
) = node
;
1628 SLP_INSTANCE_GROUP_SIZE (new_instance
) = group_size
;
1629 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
1630 SLP_INSTANCE_BODY_COST_VEC (new_instance
) = vNULL
;
1631 SLP_INSTANCE_LOADS (new_instance
) = loads
;
1632 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance
) = NULL
;
1633 SLP_INSTANCE_LOAD_PERMUTATION (new_instance
) = vNULL
;
1635 /* Compute the load permutation. */
1637 bool loads_permuted
= false;
1638 vec
<int> load_permutation
;
1639 load_permutation
.create (group_size
* group_size
);
1640 FOR_EACH_VEC_ELT (loads
, i
, load_node
)
1644 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node
), j
, load
)
1647 load_place
= vect_get_place_in_interleaving_chain
1648 (load
, GROUP_FIRST_ELEMENT (vinfo_for_stmt (load
)));
1650 /* ??? We allow loads from different groups to
1651 get to here for a special case handled in
1652 the permutation code. Make sure we get to that. */
1653 || (GROUP_FIRST_ELEMENT
1654 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node
)[0]))
1655 != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load
))))
1656 loads_permuted
= true;
1657 load_permutation
.safe_push (load_place
);
1663 SLP_INSTANCE_LOAD_PERMUTATION (new_instance
) = load_permutation
;
1664 if (!vect_supported_load_permutation_p (new_instance
, group_size
,
1667 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1670 "Build SLP failed: unsupported load "
1672 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
1675 vect_free_slp_instance (new_instance
);
1679 SLP_INSTANCE_FIRST_LOAD_STMT (new_instance
)
1680 = vect_find_first_load_in_slp_instance (new_instance
);
1683 load_permutation
.release ();
1685 /* Compute the costs of this SLP instance. */
1686 vect_analyze_slp_cost (loop_vinfo
, bb_vinfo
,
1687 new_instance
, TYPE_VECTOR_SUBPARTS (vectype
));
1690 LOOP_VINFO_SLP_INSTANCES (loop_vinfo
).safe_push (new_instance
);
1692 BB_VINFO_SLP_INSTANCES (bb_vinfo
).safe_push (new_instance
);
1694 if (dump_enabled_p ())
1695 vect_print_slp_tree (MSG_NOTE
, node
);
1700 /* Failed to SLP. */
1701 /* Free the allocated memory. */
1702 vect_free_slp_tree (node
);
1709 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
1710 trees of packed scalar stmts if SLP is possible. */
1713 vect_analyze_slp (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
1716 vec
<gimple
> grouped_stores
;
1717 vec
<gimple
> reductions
= vNULL
;
1718 vec
<gimple
> reduc_chains
= vNULL
;
1719 gimple first_element
;
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vect_analyze_slp ===");
1727 grouped_stores
= LOOP_VINFO_GROUPED_STORES (loop_vinfo
);
1728 reduc_chains
= LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
);
1729 reductions
= LOOP_VINFO_REDUCTIONS (loop_vinfo
);
1732 grouped_stores
= BB_VINFO_GROUPED_STORES (bb_vinfo
);
1734 /* Find SLP sequences starting from groups of grouped stores. */
1735 FOR_EACH_VEC_ELT (grouped_stores
, i
, first_element
)
1736 if (vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, first_element
))
1739 if (bb_vinfo
&& !ok
)
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1743 "Failed to SLP the basic block.");
1749 && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).length () > 0)
1751 /* Find SLP sequences starting from reduction chains. */
1752 FOR_EACH_VEC_ELT (reduc_chains
, i
, first_element
)
1753 if (vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, first_element
))
1758 /* Don't try to vectorize SLP reductions if reduction chain was
1763 /* Find SLP sequences starting from groups of reductions. */
1764 if (loop_vinfo
&& LOOP_VINFO_REDUCTIONS (loop_vinfo
).length () > 1
1765 && vect_analyze_slp_instance (loop_vinfo
, bb_vinfo
, reductions
[0]))
1772 /* For each possible SLP instance decide whether to SLP it and calculate overall
1773 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
1774 least one instance. */
1777 vect_make_slp_decision (loop_vec_info loop_vinfo
)
1779 unsigned int i
, unrolling_factor
= 1;
1780 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1781 slp_instance instance
;
1782 int decided_to_slp
= 0;
1784 if (dump_enabled_p ())
1785 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vect_make_slp_decision ===");
1787 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
1789 /* FORNOW: SLP if you can. */
1790 if (unrolling_factor
< SLP_INSTANCE_UNROLLING_FACTOR (instance
))
1791 unrolling_factor
= SLP_INSTANCE_UNROLLING_FACTOR (instance
);
1793 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1794 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1795 loop-based vectorization. Such stmts will be marked as HYBRID. */
1796 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
), pure_slp
, -1);
1800 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
1802 if (decided_to_slp
&& dump_enabled_p ())
1803 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
1804 "Decided to SLP %d instances. Unrolling factor %d",
1805 decided_to_slp
, unrolling_factor
);
1807 return (decided_to_slp
> 0);
1811 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1812 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1815 vect_detect_hybrid_slp_stmts (slp_tree node
)
1818 vec
<gimple
> stmts
= SLP_TREE_SCALAR_STMTS (node
);
1819 gimple stmt
= stmts
[0];
1820 imm_use_iterator imm_iter
;
1822 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
1824 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_vinfo
);
1825 struct loop
*loop
= NULL
;
1826 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_vinfo
);
1827 basic_block bb
= NULL
;
1833 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1835 bb
= BB_VINFO_BB (bb_vinfo
);
1837 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1838 if (PURE_SLP_STMT (vinfo_for_stmt (stmt
))
1839 && TREE_CODE (gimple_op (stmt
, 0)) == SSA_NAME
)
1840 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, gimple_op (stmt
, 0))
1841 if (gimple_bb (use_stmt
)
1842 && ((loop
&& flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
1843 || bb
== gimple_bb (use_stmt
))
1844 && (stmt_vinfo
= vinfo_for_stmt (use_stmt
))
1845 && !STMT_SLP_TYPE (stmt_vinfo
)
1846 && (STMT_VINFO_RELEVANT (stmt_vinfo
)
1847 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo
)))
1848 && !(gimple_code (use_stmt
) == GIMPLE_PHI
1849 && STMT_VINFO_DEF_TYPE (stmt_vinfo
)
1850 == vect_reduction_def
))
1851 vect_mark_slp_stmts (node
, hybrid
, i
);
1853 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1854 vect_detect_hybrid_slp_stmts (child
);
1858 /* Find stmts that must be both vectorized and SLPed. */
1861 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
1864 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
1865 slp_instance instance
;
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_NOTE
, vect_location
, "=== vect_detect_hybrid_slp ===");
1870 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
1871 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance
));
1875 /* Create and initialize a new bb_vec_info struct for BB, as well as
1876 stmt_vec_info structs for all the stmts in it. */
1879 new_bb_vec_info (basic_block bb
)
1881 bb_vec_info res
= NULL
;
1882 gimple_stmt_iterator gsi
;
1884 res
= (bb_vec_info
) xcalloc (1, sizeof (struct _bb_vec_info
));
1885 BB_VINFO_BB (res
) = bb
;
1887 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
1889 gimple stmt
= gsi_stmt (gsi
);
1890 gimple_set_uid (stmt
, 0);
1891 set_vinfo_for_stmt (stmt
, new_stmt_vec_info (stmt
, NULL
, res
));
1894 BB_VINFO_GROUPED_STORES (res
).create (10);
1895 BB_VINFO_SLP_INSTANCES (res
).create (2);
1896 BB_VINFO_TARGET_COST_DATA (res
) = init_cost (NULL
);
1903 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
1904 stmts in the basic block. */
1907 destroy_bb_vec_info (bb_vec_info bb_vinfo
)
1909 vec
<slp_instance
> slp_instances
;
1910 slp_instance instance
;
1912 gimple_stmt_iterator si
;
1918 bb
= BB_VINFO_BB (bb_vinfo
);
1920 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1922 gimple stmt
= gsi_stmt (si
);
1923 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1926 /* Free stmt_vec_info. */
1927 free_stmt_vec_info (stmt
);
1930 free_data_refs (BB_VINFO_DATAREFS (bb_vinfo
));
1931 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo
));
1932 BB_VINFO_GROUPED_STORES (bb_vinfo
).release ();
1933 slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
1934 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
1935 vect_free_slp_instance (instance
);
1936 BB_VINFO_SLP_INSTANCES (bb_vinfo
).release ();
1937 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo
));
1943 /* Analyze statements contained in SLP tree node after recursively analyzing
1944 the subtree. Return TRUE if the operations are supported. */
1947 vect_slp_analyze_node_operations (bb_vec_info bb_vinfo
, slp_tree node
)
1957 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
1958 if (!vect_slp_analyze_node_operations (bb_vinfo
, child
))
1961 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
1963 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1964 gcc_assert (stmt_info
);
1965 gcc_assert (PURE_SLP_STMT (stmt_info
));
1967 if (!vect_analyze_stmt (stmt
, &dummy
, node
))
1975 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
1976 operations are supported. */
1979 vect_slp_analyze_operations (bb_vec_info bb_vinfo
)
1981 vec
<slp_instance
> slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
1982 slp_instance instance
;
1985 for (i
= 0; slp_instances
.iterate (i
, &instance
); )
1987 if (!vect_slp_analyze_node_operations (bb_vinfo
,
1988 SLP_INSTANCE_TREE (instance
)))
1990 vect_free_slp_instance (instance
);
1991 slp_instances
.ordered_remove (i
);
1997 if (!slp_instances
.length ())
2003 /* Check if vectorization of the basic block is profitable. */
2006 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
)
2008 vec
<slp_instance
> slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
2009 slp_instance instance
;
2011 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
2012 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
2013 unsigned int stmt_cost
;
2015 gimple_stmt_iterator si
;
2016 basic_block bb
= BB_VINFO_BB (bb_vinfo
);
2017 void *target_cost_data
= BB_VINFO_TARGET_COST_DATA (bb_vinfo
);
2018 stmt_vec_info stmt_info
= NULL
;
2019 stmt_vector_for_cost body_cost_vec
;
2020 stmt_info_for_cost
*ci
;
2022 /* Calculate vector costs. */
2023 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2025 body_cost_vec
= SLP_INSTANCE_BODY_COST_VEC (instance
);
2027 FOR_EACH_VEC_ELT (body_cost_vec
, j
, ci
)
2029 stmt_info
= ci
->stmt
? vinfo_for_stmt (ci
->stmt
) : NULL
;
2030 (void) add_stmt_cost (target_cost_data
, ci
->count
, ci
->kind
,
2031 stmt_info
, ci
->misalign
, vect_body
);
2035 /* Calculate scalar cost. */
2036 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
2038 stmt
= gsi_stmt (si
);
2039 stmt_info
= vinfo_for_stmt (stmt
);
2041 if (!stmt_info
|| !STMT_VINFO_VECTORIZABLE (stmt_info
)
2042 || !PURE_SLP_STMT (stmt_info
))
2045 if (STMT_VINFO_DATA_REF (stmt_info
))
2047 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2048 stmt_cost
= vect_get_stmt_cost (scalar_load
);
2050 stmt_cost
= vect_get_stmt_cost (scalar_store
);
2053 stmt_cost
= vect_get_stmt_cost (scalar_stmt
);
2055 scalar_cost
+= stmt_cost
;
2058 /* Complete the target-specific cost calculation. */
2059 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo
), &vec_prologue_cost
,
2060 &vec_inside_cost
, &vec_epilogue_cost
);
2062 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
2064 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
2067 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
2069 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
2070 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
2071 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d", scalar_cost
);
2074 /* Vectorization is profitable if its cost is less than the cost of scalar
2076 if (vec_outside_cost
+ vec_inside_cost
>= scalar_cost
)
2082 /* Check if the basic block can be vectorized. */
2085 vect_slp_analyze_bb_1 (basic_block bb
)
2087 bb_vec_info bb_vinfo
;
2088 vec
<slp_instance
> slp_instances
;
2089 slp_instance instance
;
2093 bb_vinfo
= new_bb_vec_info (bb
);
2097 if (!vect_analyze_data_refs (NULL
, bb_vinfo
, &min_vf
))
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2101 "not vectorized: unhandled data-ref in basic "
2104 destroy_bb_vec_info (bb_vinfo
);
2108 if (BB_VINFO_DATAREFS (bb_vinfo
).length () < 2)
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2112 "not vectorized: not enough data-refs in "
2115 destroy_bb_vec_info (bb_vinfo
);
2119 if (!vect_analyze_data_ref_accesses (NULL
, bb_vinfo
))
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2123 "not vectorized: unhandled data access in "
2126 destroy_bb_vec_info (bb_vinfo
);
2130 vect_pattern_recog (NULL
, bb_vinfo
);
2132 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo
))
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2136 "not vectorized: unhandled data dependence "
2137 "in basic block.\n");
2139 destroy_bb_vec_info (bb_vinfo
);
2143 if (!vect_analyze_data_refs_alignment (NULL
, bb_vinfo
))
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2147 "not vectorized: bad data alignment in basic "
2150 destroy_bb_vec_info (bb_vinfo
);
2154 /* Check the SLP opportunities in the basic block, analyze and build SLP
2156 if (!vect_analyze_slp (NULL
, bb_vinfo
))
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2160 "not vectorized: failed to find SLP opportunities "
2161 "in basic block.\n");
2163 destroy_bb_vec_info (bb_vinfo
);
2167 slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
2169 /* Mark all the statements that we want to vectorize as pure SLP and
2171 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2173 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
), pure_slp
, -1);
2174 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
2177 if (!vect_verify_datarefs_alignment (NULL
, bb_vinfo
))
2179 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2181 "not vectorized: unsupported alignment in basic "
2183 destroy_bb_vec_info (bb_vinfo
);
2187 if (!vect_slp_analyze_operations (bb_vinfo
))
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2191 "not vectorized: bad operation in basic block.\n");
2193 destroy_bb_vec_info (bb_vinfo
);
2197 /* Cost model: check if the vectorization is worthwhile. */
2198 if (flag_vect_cost_model
2199 && !vect_bb_vectorization_profitable_p (bb_vinfo
))
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2203 "not vectorized: vectorization is not "
2206 destroy_bb_vec_info (bb_vinfo
);
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_NOTE
, vect_location
,
2212 "Basic block will be vectorized using SLP\n");
2219 vect_slp_analyze_bb (basic_block bb
)
2221 bb_vec_info bb_vinfo
;
2223 gimple_stmt_iterator gsi
;
2224 unsigned int vector_sizes
;
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_NOTE
, vect_location
, "===vect_slp_analyze_bb===\n");
2229 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2231 gimple stmt
= gsi_stmt (gsi
);
2232 if (!is_gimple_debug (stmt
)
2233 && !gimple_nop_p (stmt
)
2234 && gimple_code (stmt
) != GIMPLE_LABEL
)
2238 if (insns
> PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB
))
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2242 "not vectorized: too many instructions in "
2248 /* Autodetect first vector size we try. */
2249 current_vector_size
= 0;
2250 vector_sizes
= targetm
.vectorize
.autovectorize_vector_sizes ();
2254 bb_vinfo
= vect_slp_analyze_bb_1 (bb
);
2258 destroy_bb_vec_info (bb_vinfo
);
2260 vector_sizes
&= ~current_vector_size
;
2261 if (vector_sizes
== 0
2262 || current_vector_size
== 0)
2265 /* Try the next biggest vector size. */
2266 current_vector_size
= 1 << floor_log2 (vector_sizes
);
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_NOTE
, vect_location
,
2269 "***** Re-trying analysis with "
2270 "vector size %d\n", current_vector_size
);
2275 /* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2276 the number of created vector stmts depends on the unrolling factor).
2277 However, the actual number of vector stmts for every SLP node depends on
2278 VF which is set later in vect_analyze_operations (). Hence, SLP costs
2279 should be updated. In this function we assume that the inside costs
2280 calculated in vect_model_xxx_cost are linear in ncopies. */
2283 vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo
)
2285 unsigned int i
, j
, vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2286 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
2287 slp_instance instance
;
2288 stmt_vector_for_cost body_cost_vec
;
2289 stmt_info_for_cost
*si
;
2290 void *data
= LOOP_VINFO_TARGET_COST_DATA (loop_vinfo
);
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE
, vect_location
,
2294 "=== vect_update_slp_costs_according_to_vf ===");
2296 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
2298 /* We assume that costs are linear in ncopies. */
2299 int ncopies
= vf
/ SLP_INSTANCE_UNROLLING_FACTOR (instance
);
2301 /* Record the instance's instructions in the target cost model.
2302 This was delayed until here because the count of instructions
2303 isn't known beforehand. */
2304 body_cost_vec
= SLP_INSTANCE_BODY_COST_VEC (instance
);
2306 FOR_EACH_VEC_ELT (body_cost_vec
, j
, si
)
2307 (void) add_stmt_cost (data
, si
->count
* ncopies
, si
->kind
,
2308 vinfo_for_stmt (si
->stmt
), si
->misalign
,
2314 /* For constant and loop invariant defs of SLP_NODE this function returns
2315 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2316 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2317 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2318 REDUC_INDEX is the index of the reduction operand in the statements, unless
2322 vect_get_constant_vectors (tree op
, slp_tree slp_node
,
2323 vec
<tree
> *vec_oprnds
,
2324 unsigned int op_num
, unsigned int number_of_vectors
,
2327 vec
<gimple
> stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
2328 gimple stmt
= stmts
[0];
2329 stmt_vec_info stmt_vinfo
= vinfo_for_stmt (stmt
);
2333 unsigned j
, number_of_places_left_in_vector
;
2336 int group_size
= stmts
.length ();
2337 unsigned int vec_num
, i
;
2338 unsigned number_of_copies
= 1;
2340 voprnds
.create (number_of_vectors
);
2341 bool constant_p
, is_store
;
2342 tree neutral_op
= NULL
;
2343 enum tree_code code
= gimple_expr_code (stmt
);
2346 gimple_seq ctor_seq
= NULL
;
2348 if (STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_reduction_def
2349 && reduc_index
!= -1)
2351 op_num
= reduc_index
- 1;
2352 op
= gimple_op (stmt
, reduc_index
);
2353 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2354 we need either neutral operands or the original operands. See
2355 get_initial_def_for_reduction() for details. */
2358 case WIDEN_SUM_EXPR
:
2364 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op
)))
2365 neutral_op
= build_real (TREE_TYPE (op
), dconst0
);
2367 neutral_op
= build_int_cst (TREE_TYPE (op
), 0);
2372 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op
)))
2373 neutral_op
= build_real (TREE_TYPE (op
), dconst1
);
2375 neutral_op
= build_int_cst (TREE_TYPE (op
), 1);
2380 neutral_op
= build_int_cst (TREE_TYPE (op
), -1);
2385 def_stmt
= SSA_NAME_DEF_STMT (op
);
2386 loop
= (gimple_bb (stmt
))->loop_father
;
2387 neutral_op
= PHI_ARG_DEF_FROM_EDGE (def_stmt
,
2388 loop_preheader_edge (loop
));
2396 if (STMT_VINFO_DATA_REF (stmt_vinfo
))
2399 op
= gimple_assign_rhs1 (stmt
);
2406 if (CONSTANT_CLASS_P (op
))
2411 vector_type
= get_vectype_for_scalar_type (TREE_TYPE (op
));
2412 gcc_assert (vector_type
);
2413 nunits
= TYPE_VECTOR_SUBPARTS (vector_type
);
2415 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2416 created vectors. It is greater than 1 if unrolling is performed.
2418 For example, we have two scalar operands, s1 and s2 (e.g., group of
2419 strided accesses of size two), while NUNITS is four (i.e., four scalars
2420 of this type can be packed in a vector). The output vector will contain
2421 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2424 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2425 containing the operands.
2427 For example, NUNITS is four as before, and the group size is 8
2428 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2429 {s5, s6, s7, s8}. */
2431 number_of_copies
= least_common_multiple (nunits
, group_size
) / group_size
;
2433 number_of_places_left_in_vector
= nunits
;
2434 elts
= XALLOCAVEC (tree
, nunits
);
2435 for (j
= 0; j
< number_of_copies
; j
++)
2437 for (i
= group_size
- 1; stmts
.iterate (i
, &stmt
); i
--)
2440 op
= gimple_assign_rhs1 (stmt
);
2446 if (op_num
== 0 || op_num
== 1)
2448 tree cond
= gimple_assign_rhs1 (stmt
);
2449 op
= TREE_OPERAND (cond
, op_num
);
2454 op
= gimple_assign_rhs2 (stmt
);
2456 op
= gimple_assign_rhs3 (stmt
);
2461 op
= gimple_call_arg (stmt
, op_num
);
2468 op
= gimple_op (stmt
, op_num
+ 1);
2469 /* Unlike the other binary operators, shifts/rotates have
2470 the shift count being int, instead of the same type as
2471 the lhs, so make sure the scalar is the right type if
2472 we are dealing with vectors of
2473 long long/long/short/char. */
2474 if (op_num
== 1 && constant_p
)
2475 op
= fold_convert (TREE_TYPE (vector_type
), op
);
2479 op
= gimple_op (stmt
, op_num
+ 1);
2484 if (reduc_index
!= -1)
2486 loop
= (gimple_bb (stmt
))->loop_father
;
2487 def_stmt
= SSA_NAME_DEF_STMT (op
);
2491 /* Get the def before the loop. In reduction chain we have only
2492 one initial value. */
2493 if ((j
!= (number_of_copies
- 1)
2494 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
))
2499 op
= PHI_ARG_DEF_FROM_EDGE (def_stmt
,
2500 loop_preheader_edge (loop
));
2503 /* Create 'vect_ = {op0,op1,...,opn}'. */
2504 number_of_places_left_in_vector
--;
2505 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
2509 op
= fold_unary (VIEW_CONVERT_EXPR
,
2510 TREE_TYPE (vector_type
), op
);
2511 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
2516 = make_ssa_name (TREE_TYPE (vector_type
), NULL
);
2518 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
2521 = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR
,
2522 new_temp
, op
, NULL_TREE
);
2523 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
2527 elts
[number_of_places_left_in_vector
] = op
;
2529 if (number_of_places_left_in_vector
== 0)
2531 number_of_places_left_in_vector
= nunits
;
2534 vec_cst
= build_vector (vector_type
, elts
);
2537 vec
<constructor_elt
, va_gc
> *v
;
2539 vec_alloc (v
, nunits
);
2540 for (k
= 0; k
< nunits
; ++k
)
2541 CONSTRUCTOR_APPEND_ELT (v
, NULL_TREE
, elts
[k
]);
2542 vec_cst
= build_constructor (vector_type
, v
);
2544 voprnds
.quick_push (vect_init_vector (stmt
, vec_cst
,
2545 vector_type
, NULL
));
2546 if (ctor_seq
!= NULL
)
2548 gimple init_stmt
= SSA_NAME_DEF_STMT (voprnds
.last ());
2549 gimple_stmt_iterator gsi
= gsi_for_stmt (init_stmt
);
2550 gsi_insert_seq_before_without_update (&gsi
, ctor_seq
,
2558 /* Since the vectors are created in the reverse order, we should invert
2560 vec_num
= voprnds
.length ();
2561 for (j
= vec_num
; j
!= 0; j
--)
2563 vop
= voprnds
[j
- 1];
2564 vec_oprnds
->quick_push (vop
);
2569 /* In case that VF is greater than the unrolling factor needed for the SLP
2570 group of stmts, NUMBER_OF_VECTORS to be created is greater than
2571 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2572 to replicate the vectors. */
2573 while (number_of_vectors
> vec_oprnds
->length ())
2575 tree neutral_vec
= NULL
;
2580 neutral_vec
= build_vector_from_val (vector_type
, neutral_op
);
2582 vec_oprnds
->quick_push (neutral_vec
);
2586 for (i
= 0; vec_oprnds
->iterate (i
, &vop
) && i
< vec_num
; i
++)
2587 vec_oprnds
->quick_push (vop
);
2593 /* Get vectorized definitions from SLP_NODE that contains corresponding
2594 vectorized def-stmts. */
2597 vect_get_slp_vect_defs (slp_tree slp_node
, vec
<tree
> *vec_oprnds
)
2600 gimple vec_def_stmt
;
2603 gcc_assert (SLP_TREE_VEC_STMTS (slp_node
).exists ());
2605 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), i
, vec_def_stmt
)
2607 gcc_assert (vec_def_stmt
);
2608 vec_oprnd
= gimple_get_lhs (vec_def_stmt
);
2609 vec_oprnds
->quick_push (vec_oprnd
);
2614 /* Get vectorized definitions for SLP_NODE.
2615 If the scalar definitions are loop invariants or constants, collect them and
2616 call vect_get_constant_vectors() to create vector stmts.
2617 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2618 must be stored in the corresponding child of SLP_NODE, and we call
2619 vect_get_slp_vect_defs () to retrieve them. */
2622 vect_get_slp_defs (vec
<tree
> ops
, slp_tree slp_node
,
2623 vec
<vec
<tree
> > *vec_oprnds
, int reduc_index
)
2626 int number_of_vects
= 0, i
;
2627 unsigned int child_index
= 0;
2628 HOST_WIDE_INT lhs_size_unit
, rhs_size_unit
;
2629 slp_tree child
= NULL
;
2632 bool vectorized_defs
;
2634 first_stmt
= SLP_TREE_SCALAR_STMTS (slp_node
)[0];
2635 FOR_EACH_VEC_ELT (ops
, i
, oprnd
)
2637 /* For each operand we check if it has vectorized definitions in a child
2638 node or we need to create them (for invariants and constants). We
2639 check if the LHS of the first stmt of the next child matches OPRND.
2640 If it does, we found the correct child. Otherwise, we call
2641 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
2642 to check this child node for the next operand. */
2643 vectorized_defs
= false;
2644 if (SLP_TREE_CHILDREN (slp_node
).length () > child_index
)
2646 child
= (slp_tree
) SLP_TREE_CHILDREN (slp_node
)[child_index
];
2648 /* We have to check both pattern and original def, if available. */
2649 gimple first_def
= SLP_TREE_SCALAR_STMTS (child
)[0];
2650 gimple related
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def
));
2652 if (operand_equal_p (oprnd
, gimple_get_lhs (first_def
), 0)
2654 && operand_equal_p (oprnd
, gimple_get_lhs (related
), 0)))
2656 /* The number of vector defs is determined by the number of
2657 vector statements in the node from which we get those
2659 number_of_vects
= SLP_TREE_NUMBER_OF_VEC_STMTS (child
);
2660 vectorized_defs
= true;
2665 if (!vectorized_defs
)
2669 number_of_vects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
2670 /* Number of vector stmts was calculated according to LHS in
2671 vect_schedule_slp_instance (), fix it by replacing LHS with
2672 RHS, if necessary. See vect_get_smallest_scalar_type () for
2674 vect_get_smallest_scalar_type (first_stmt
, &lhs_size_unit
,
2676 if (rhs_size_unit
!= lhs_size_unit
)
2678 number_of_vects
*= rhs_size_unit
;
2679 number_of_vects
/= lhs_size_unit
;
2684 /* Allocate memory for vectorized defs. */
2686 vec_defs
.create (number_of_vects
);
2688 /* For reduction defs we call vect_get_constant_vectors (), since we are
2689 looking for initial loop invariant values. */
2690 if (vectorized_defs
&& reduc_index
== -1)
2691 /* The defs are already vectorized. */
2692 vect_get_slp_vect_defs (child
, &vec_defs
);
2694 /* Build vectors from scalar defs. */
2695 vect_get_constant_vectors (oprnd
, slp_node
, &vec_defs
, i
,
2696 number_of_vects
, reduc_index
);
2698 vec_oprnds
->quick_push (vec_defs
);
2700 /* For reductions, we only need initial values. */
2701 if (reduc_index
!= -1)
2707 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
2708 building a vector of type MASK_TYPE from it) and two input vectors placed in
2709 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2710 shifting by STRIDE elements of DR_CHAIN for every copy.
2711 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2713 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2714 the created stmts must be inserted. */
2717 vect_create_mask_and_perm (gimple stmt
, gimple next_scalar_stmt
,
2718 tree mask
, int first_vec_indx
, int second_vec_indx
,
2719 gimple_stmt_iterator
*gsi
, slp_tree node
,
2720 tree vectype
, vec
<tree
> dr_chain
,
2721 int ncopies
, int vect_stmts_counter
)
2724 gimple perm_stmt
= NULL
;
2725 stmt_vec_info next_stmt_info
;
2727 tree first_vec
, second_vec
, data_ref
;
2729 stride
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
) / ncopies
;
2731 /* Initialize the vect stmts of NODE to properly insert the generated
2733 for (i
= SLP_TREE_VEC_STMTS (node
).length ();
2734 i
< (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
2735 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
2737 perm_dest
= vect_create_destination_var (gimple_assign_lhs (stmt
), vectype
);
2738 for (i
= 0; i
< ncopies
; i
++)
2740 first_vec
= dr_chain
[first_vec_indx
];
2741 second_vec
= dr_chain
[second_vec_indx
];
2743 /* Generate the permute statement. */
2744 perm_stmt
= gimple_build_assign_with_ops (VEC_PERM_EXPR
, perm_dest
,
2745 first_vec
, second_vec
, mask
);
2746 data_ref
= make_ssa_name (perm_dest
, perm_stmt
);
2747 gimple_set_lhs (perm_stmt
, data_ref
);
2748 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
2750 /* Store the vector statement in NODE. */
2751 SLP_TREE_VEC_STMTS (node
)[stride
* i
+ vect_stmts_counter
] = perm_stmt
;
2753 first_vec_indx
+= stride
;
2754 second_vec_indx
+= stride
;
2757 /* Mark the scalar stmt as vectorized. */
2758 next_stmt_info
= vinfo_for_stmt (next_scalar_stmt
);
2759 STMT_VINFO_VEC_STMT (next_stmt_info
) = perm_stmt
;
2763 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2764 return in CURRENT_MASK_ELEMENT its equivalent in target specific
2765 representation. Check that the mask is valid and return FALSE if not.
2766 Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
2767 the next vector, i.e., the current first vector is not needed. */
2770 vect_get_mask_element (gimple stmt
, int first_mask_element
, int m
,
2771 int mask_nunits
, bool only_one_vec
, int index
,
2772 unsigned char *mask
, int *current_mask_element
,
2773 bool *need_next_vector
, int *number_of_mask_fixes
,
2774 bool *mask_fixed
, bool *needs_first_vector
)
2778 /* Convert to target specific representation. */
2779 *current_mask_element
= first_mask_element
+ m
;
2780 /* Adjust the value in case it's a mask for second and third vectors. */
2781 *current_mask_element
-= mask_nunits
* (*number_of_mask_fixes
- 1);
2783 if (*current_mask_element
< mask_nunits
)
2784 *needs_first_vector
= true;
2786 /* We have only one input vector to permute but the mask accesses values in
2787 the next vector as well. */
2788 if (only_one_vec
&& *current_mask_element
>= mask_nunits
)
2790 if (dump_enabled_p ())
2792 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2793 "permutation requires at least two vectors ");
2794 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2800 /* The mask requires the next vector. */
2801 if (*current_mask_element
>= mask_nunits
* 2)
2803 if (*needs_first_vector
|| *mask_fixed
)
2805 /* We either need the first vector too or have already moved to the
2806 next vector. In both cases, this permutation needs three
2808 if (dump_enabled_p ())
2810 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2811 "permutation requires at "
2812 "least three vectors ");
2813 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2819 /* We move to the next vector, dropping the first one and working with
2820 the second and the third - we need to adjust the values of the mask
2822 *current_mask_element
-= mask_nunits
* *number_of_mask_fixes
;
2824 for (i
= 0; i
< index
; i
++)
2825 mask
[i
] -= mask_nunits
* *number_of_mask_fixes
;
2827 (*number_of_mask_fixes
)++;
2831 *need_next_vector
= *mask_fixed
;
2833 /* This was the last element of this mask. Start a new one. */
2834 if (index
== mask_nunits
- 1)
2836 *number_of_mask_fixes
= 1;
2837 *mask_fixed
= false;
2838 *needs_first_vector
= false;
2845 /* Generate vector permute statements from a list of loads in DR_CHAIN.
2846 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
2847 permute statements for SLP_NODE_INSTANCE. */
2849 vect_transform_slp_perm_load (gimple stmt
, vec
<tree
> dr_chain
,
2850 gimple_stmt_iterator
*gsi
, int vf
,
2851 slp_instance slp_node_instance
, bool analyze_only
)
2853 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2854 tree mask_element_type
= NULL_TREE
, mask_type
;
2855 int i
, j
, k
, nunits
, vec_index
= 0, scalar_index
;
2857 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2858 gimple next_scalar_stmt
;
2859 int group_size
= SLP_INSTANCE_GROUP_SIZE (slp_node_instance
);
2860 int first_mask_element
;
2861 int index
, unroll_factor
, current_mask_element
, ncopies
;
2862 unsigned char *mask
;
2863 bool only_one_vec
= false, need_next_vector
= false;
2864 int first_vec_index
, second_vec_index
, orig_vec_stmts_num
, vect_stmts_counter
;
2865 int number_of_mask_fixes
= 1;
2866 bool mask_fixed
= false;
2867 bool needs_first_vector
= false;
2868 enum machine_mode mode
;
2870 mode
= TYPE_MODE (vectype
);
2872 if (!can_vec_perm_p (mode
, false, NULL
))
2874 if (dump_enabled_p ())
2876 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2877 "no vect permute for ");
2878 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2883 /* The generic VEC_PERM_EXPR code always uses an integral type of the
2884 same size as the vector element being permuted. */
2885 mask_element_type
= lang_hooks
.types
.type_for_mode
2886 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype
))), 1);
2887 mask_type
= get_vectype_for_scalar_type (mask_element_type
);
2888 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
2889 mask
= XALLOCAVEC (unsigned char, nunits
);
2890 unroll_factor
= SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance
);
2892 /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
2893 unrolling factor. */
2894 orig_vec_stmts_num
= group_size
*
2895 SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance
) / nunits
;
2896 if (orig_vec_stmts_num
== 1)
2897 only_one_vec
= true;
2899 /* Number of copies is determined by the final vectorization factor
2900 relatively to SLP_NODE_INSTANCE unrolling factor. */
2901 ncopies
= vf
/ SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance
);
2903 /* Generate permutation masks for every NODE. Number of masks for each NODE
2904 is equal to GROUP_SIZE.
2905 E.g., we have a group of three nodes with three loads from the same
2906 location in each node, and the vector size is 4. I.e., we have a
2907 a0b0c0a1b1c1... sequence and we need to create the following vectors:
2908 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
2909 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
2912 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
2913 The last mask is illegal since we assume two operands for permute
2914 operation, and the mask element values can't be outside that range.
2915 Hence, the last mask must be converted into {2,5,5,5}.
2916 For the first two permutations we need the first and the second input
2917 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
2918 we need the second and the third vectors: {b1,c1,a2,b2} and
2921 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_node_instance
), i
, node
)
2925 vect_stmts_counter
= 0;
2927 first_vec_index
= vec_index
++;
2929 second_vec_index
= first_vec_index
;
2931 second_vec_index
= vec_index
++;
2933 for (j
= 0; j
< unroll_factor
; j
++)
2935 for (k
= 0; k
< group_size
; k
++)
2937 first_mask_element
= i
+ j
* group_size
;
2938 if (!vect_get_mask_element (stmt
, first_mask_element
, 0,
2939 nunits
, only_one_vec
, index
,
2940 mask
, ¤t_mask_element
,
2942 &number_of_mask_fixes
, &mask_fixed
,
2943 &needs_first_vector
))
2945 mask
[index
++] = current_mask_element
;
2947 if (index
== nunits
)
2949 tree mask_vec
, *mask_elts
;
2952 if (!can_vec_perm_p (mode
, false, mask
))
2954 if (dump_enabled_p ())
2956 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
2958 "unsupported vect permute { ");
2959 for (i
= 0; i
< nunits
; ++i
)
2960 dump_printf (MSG_MISSED_OPTIMIZATION
, "%d ",
2962 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
2967 mask_elts
= XALLOCAVEC (tree
, nunits
);
2968 for (l
= 0; l
< nunits
; ++l
)
2969 mask_elts
[l
] = build_int_cst (mask_element_type
, mask
[l
]);
2970 mask_vec
= build_vector (mask_type
, mask_elts
);
2975 if (need_next_vector
)
2977 first_vec_index
= second_vec_index
;
2978 second_vec_index
= vec_index
;
2982 = SLP_TREE_SCALAR_STMTS (node
)[scalar_index
++];
2984 vect_create_mask_and_perm (stmt
, next_scalar_stmt
,
2985 mask_vec
, first_vec_index
, second_vec_index
,
2986 gsi
, node
, vectype
, dr_chain
,
2987 ncopies
, vect_stmts_counter
++);
2999 /* Vectorize SLP instance tree in postorder. */
3002 vect_schedule_slp_instance (slp_tree node
, slp_instance instance
,
3003 unsigned int vectorization_factor
)
3006 bool grouped_store
, is_store
;
3007 gimple_stmt_iterator si
;
3008 stmt_vec_info stmt_info
;
3009 unsigned int vec_stmts_size
, nunits
, group_size
;
3012 slp_tree loads_node
;
3018 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3019 vect_schedule_slp_instance (child
, instance
, vectorization_factor
);
3021 stmt
= SLP_TREE_SCALAR_STMTS (node
)[0];
3022 stmt_info
= vinfo_for_stmt (stmt
);
3024 /* VECTYPE is the type of the destination. */
3025 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3026 nunits
= (unsigned int) TYPE_VECTOR_SUBPARTS (vectype
);
3027 group_size
= SLP_INSTANCE_GROUP_SIZE (instance
);
3029 /* For each SLP instance calculate number of vector stmts to be created
3030 for the scalar stmts in each node of the SLP tree. Number of vector
3031 elements in one vector iteration is the number of scalar elements in
3032 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3034 vec_stmts_size
= (vectorization_factor
* group_size
) / nunits
;
3036 /* In case of load permutation we have to allocate vectorized statements for
3037 all the nodes that participate in that permutation. */
3038 if (SLP_INSTANCE_LOAD_PERMUTATION (instance
).exists ())
3040 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, loads_node
)
3042 if (!SLP_TREE_VEC_STMTS (loads_node
).exists ())
3044 SLP_TREE_VEC_STMTS (loads_node
).create (vec_stmts_size
);
3045 SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node
) = vec_stmts_size
;
3050 if (!SLP_TREE_VEC_STMTS (node
).exists ())
3052 SLP_TREE_VEC_STMTS (node
).create (vec_stmts_size
);
3053 SLP_TREE_NUMBER_OF_VEC_STMTS (node
) = vec_stmts_size
;
3056 if (dump_enabled_p ())
3058 dump_printf_loc (MSG_NOTE
,vect_location
,
3059 "------>vectorizing SLP node starting from: ");
3060 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3063 /* Loads should be inserted before the first load. */
3064 if (SLP_INSTANCE_FIRST_LOAD_STMT (instance
)
3065 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3066 && !REFERENCE_CLASS_P (gimple_get_lhs (stmt
))
3067 && SLP_INSTANCE_LOAD_PERMUTATION (instance
).exists ())
3068 si
= gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance
));
3069 else if (is_pattern_stmt_p (stmt_info
))
3070 si
= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info
));
3072 si
= gsi_for_stmt (stmt
);
3074 /* Stores should be inserted just before the last store. */
3075 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3076 && REFERENCE_CLASS_P (gimple_get_lhs (stmt
)))
3078 gimple last_store
= vect_find_last_store_in_slp_instance (instance
);
3079 if (is_pattern_stmt_p (vinfo_for_stmt (last_store
)))
3080 last_store
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store
));
3081 si
= gsi_for_stmt (last_store
);
3084 /* Mark the first element of the reduction chain as reduction to properly
3085 transform the node. In the analysis phase only the last element of the
3086 chain is marked as reduction. */
3087 if (GROUP_FIRST_ELEMENT (stmt_info
) && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3088 && GROUP_FIRST_ELEMENT (stmt_info
) == stmt
)
3090 STMT_VINFO_DEF_TYPE (stmt_info
) = vect_reduction_def
;
3091 STMT_VINFO_TYPE (stmt_info
) = reduc_vec_info_type
;
3094 is_store
= vect_transform_stmt (stmt
, &si
, &grouped_store
, node
, instance
);
3098 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3099 For loop vectorization this is done in vectorizable_call, but for SLP
3100 it needs to be deferred until end of vect_schedule_slp, because multiple
3101 SLP instances may refer to the same scalar stmt. */
3104 vect_remove_slp_scalar_calls (slp_tree node
)
3106 gimple stmt
, new_stmt
;
3107 gimple_stmt_iterator gsi
;
3111 stmt_vec_info stmt_info
;
3116 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3117 vect_remove_slp_scalar_calls (child
);
3119 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt
)
3121 if (!is_gimple_call (stmt
) || gimple_bb (stmt
) == NULL
)
3123 stmt_info
= vinfo_for_stmt (stmt
);
3124 if (stmt_info
== NULL
3125 || is_pattern_stmt_p (stmt_info
)
3126 || !PURE_SLP_STMT (stmt_info
))
3128 lhs
= gimple_call_lhs (stmt
);
3129 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
3130 set_vinfo_for_stmt (new_stmt
, stmt_info
);
3131 set_vinfo_for_stmt (stmt
, NULL
);
3132 STMT_VINFO_STMT (stmt_info
) = new_stmt
;
3133 gsi
= gsi_for_stmt (stmt
);
3134 gsi_replace (&gsi
, new_stmt
, false);
3135 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
3139 /* Generate vector code for all SLP instances in the loop/basic block. */
3142 vect_schedule_slp (loop_vec_info loop_vinfo
, bb_vec_info bb_vinfo
)
3144 vec
<slp_instance
> slp_instances
;
3145 slp_instance instance
;
3146 slp_tree loads_node
;
3147 unsigned int i
, j
, vf
;
3148 bool is_store
= false;
3152 slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3153 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3157 slp_instances
= BB_VINFO_SLP_INSTANCES (bb_vinfo
);
3161 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3163 /* Schedule the tree of INSTANCE. */
3164 is_store
= vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance
),
3167 /* Clear STMT_VINFO_VEC_STMT of all loads. With shared loads
3168 between SLP instances we fail to properly initialize the
3169 vectorized SLP stmts and confuse different load permutations. */
3170 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, loads_node
)
3172 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node
)[0])) = NULL
;
3174 if (dump_enabled_p ())
3175 dump_printf_loc (MSG_NOTE
, vect_location
,
3176 "vectorizing stmts using SLP.");
3179 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3181 slp_tree root
= SLP_INSTANCE_TREE (instance
);
3184 gimple_stmt_iterator gsi
;
3186 /* Remove scalar call stmts. Do not do this for basic-block
3187 vectorization as not all uses may be vectorized.
3188 ??? Why should this be necessary? DCE should be able to
3189 remove the stmts itself.
3190 ??? For BB vectorization we can as well remove scalar
3191 stmts starting from the SLP tree root if they have no
3194 vect_remove_slp_scalar_calls (root
);
3196 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store
)
3197 && j
< SLP_INSTANCE_GROUP_SIZE (instance
); j
++)
3199 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store
)))
3202 if (is_pattern_stmt_p (vinfo_for_stmt (store
)))
3203 store
= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store
));
3204 /* Free the attached stmt_vec_info and remove the stmt. */
3205 gsi
= gsi_for_stmt (store
);
3206 unlink_stmt_vdef (store
);
3207 gsi_remove (&gsi
, true);
3208 release_defs (store
);
3209 free_stmt_vec_info (store
);
3217 /* Vectorize the basic block. */
3220 vect_slp_transform_bb (basic_block bb
)
3222 bb_vec_info bb_vinfo
= vec_info_for_bb (bb
);
3223 gimple_stmt_iterator si
;
3225 gcc_assert (bb_vinfo
);
3227 if (dump_enabled_p ())
3228 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB\n");
3230 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
3232 gimple stmt
= gsi_stmt (si
);
3233 stmt_vec_info stmt_info
;
3235 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_NOTE
, vect_location
,
3238 "------>SLPing statement: ");
3239 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3242 stmt_info
= vinfo_for_stmt (stmt
);
3243 gcc_assert (stmt_info
);
3245 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3246 if (STMT_SLP_TYPE (stmt_info
))
3248 vect_schedule_slp (NULL
, bb_vinfo
);
3253 if (dump_enabled_p ())
3254 dump_printf (MSG_OPTIMIZED_LOCATIONS
, "BASIC BLOCK VECTORIZED\n");
3256 destroy_bb_vec_info (bb_vinfo
);