1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
33 #include "optabs-tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
58 tree vectype
, unsigned HOST_WIDE_INT count
)
60 machine_mode mode
, array_mode
;
63 mode
= TYPE_MODE (vectype
);
64 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
65 array_mode
= mode_for_size (count
* GET_MODE_BITSIZE (mode
),
68 if (array_mode
== BLKmode
)
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC
"]\n",
73 GET_MODE_NAME (mode
), count
);
77 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
81 "cannot use %s<%s><%s>\n", name
,
82 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE
, vect_location
,
88 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
89 GET_MODE_NAME (mode
));
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 vect_get_smallest_scalar_type (gimple
*stmt
, HOST_WIDE_INT
*lhs_size_unit
,
114 HOST_WIDE_INT
*rhs_size_unit
)
116 tree scalar_type
= gimple_expr_type (stmt
);
117 HOST_WIDE_INT lhs
, rhs
;
119 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
121 if (is_gimple_assign (stmt
)
122 && (gimple_assign_cast_p (stmt
)
123 || gimple_assign_rhs_code (stmt
) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt
) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt
) == FLOAT_EXPR
))
127 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (stmt
));
129 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
131 scalar_type
= rhs_type
;
134 *lhs_size_unit
= lhs
;
135 *rhs_size_unit
= rhs
;
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
145 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
147 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
) == 0)
152 if (dump_enabled_p ())
154 dump_printf_loc (MSG_NOTE
, vect_location
,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_A (ddr
)));
157 dump_printf (MSG_NOTE
, " and ");
158 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (DDR_B (ddr
)));
159 dump_printf (MSG_NOTE
, "\n");
162 if (optimize_loop_nest_for_size_p (loop
))
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
166 "versioning not supported when optimizing"
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
176 "versioning not yet supported for outer-loops.\n");
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 if (TREE_CODE (DR_STEP (DDR_A (ddr
))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr
))) != INTEGER_CST
)
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
187 "versioning not yet supported for non-constant "
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
197 /* Function vect_analyze_data_ref_dependence.
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When versioning for alias may check a
201 dependence at run-time, return FALSE. Adjust *MAX_VF according to
202 the data dependence. */
205 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
206 loop_vec_info loop_vinfo
, int *max_vf
)
209 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
210 struct data_reference
*dra
= DDR_A (ddr
);
211 struct data_reference
*drb
= DDR_B (ddr
);
212 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
213 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
214 lambda_vector dist_v
;
215 unsigned int loop_depth
;
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
227 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
237 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
239 get_alias_set (DR_REF (drb
))))
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop
->safelen
>= 2)
249 if (loop
->safelen
< *max_vf
)
250 *max_vf
= loop
->safelen
;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
258 if (dump_enabled_p ())
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
265 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
268 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
273 if (dump_enabled_p ())
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
280 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
283 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop
->safelen
>= 2)
297 if (loop
->safelen
< *max_vf
)
298 *max_vf
= loop
->safelen
;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
306 if (dump_enabled_p ())
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
313 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
316 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
321 if (dump_enabled_p ())
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
327 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
329 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
335 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
338 int dist
= dist_v
[loop_depth
];
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE
, vect_location
,
342 "dependence distance = %d.\n", dist
);
346 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE
, vect_location
,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
351 dump_printf (MSG_NOTE
, " and ");
352 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
353 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
377 gimple
*earlier_stmt
;
378 earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
384 "READ_WRITE dependence in interleaving."
393 if (dist
> 0 && DDR_REVERSED_P (ddr
))
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
412 && abs (dist
) < *max_vf
)
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf
= abs (dist
);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE
, vect_location
,
419 "adjusting maximal vectorization factor to %i\n",
423 if (abs (dist
) >= *max_vf
)
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE
, vect_location
,
429 "dependence distance >= VF.\n");
433 if (dump_enabled_p ())
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
439 dump_printf (MSG_NOTE
, " and ");
440 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
441 dump_printf (MSG_NOTE
, "\n");
450 /* Function vect_analyze_data_ref_dependences.
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
, int *max_vf
)
460 struct data_dependence_relation
*ddr
;
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE
, vect_location
,
464 "=== vect_analyze_data_ref_dependences ===\n");
466 LOOP_VINFO_DDRS (loop_vinfo
)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
471 &LOOP_VINFO_DDRS (loop_vinfo
),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo
), true))
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
476 if (vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
))
483 /* Function vect_slp_analyze_data_ref_dependence.
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB. When versioning for alias may check a
487 dependence at run-time, return FALSE. Adjust *MAX_VF according to
488 the data dependence. */
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
)
493 struct data_reference
*dra
= DDR_A (ddr
);
494 struct data_reference
*drb
= DDR_B (ddr
);
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
510 /* If dra and drb are part of the same interleaving chain consider
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra
)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra
)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb
)))))
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (dra
));
525 dump_printf (MSG_MISSED_OPTIMIZATION
, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, DR_REF (drb
));
527 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
530 else if (dump_enabled_p ())
532 dump_printf_loc (MSG_NOTE
, vect_location
,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
535 dump_printf (MSG_NOTE
, " and ");
536 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
537 dump_printf (MSG_NOTE
, "\n");
540 /* We do not vectorize basic blocks with write-write dependencies. */
541 if (DR_IS_WRITE (dra
) && DR_IS_WRITE (drb
))
544 /* If we have a read-write dependence check that the load is before the store.
545 When we vectorize basic blocks, vector load can be only before
546 corresponding scalar load, and vector store can be only after its
547 corresponding scalar store. So the order of the acceses is preserved in
548 case the load is before the store. */
549 gimple
*earlier_stmt
= get_earlier_stmt (DR_STMT (dra
), DR_STMT (drb
));
550 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt
))))
552 /* That only holds for load-store pairs taking part in vectorization. */
553 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra
)))
554 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb
))))
562 /* Analyze dependences involved in the transform of SLP NODE. */
565 vect_slp_analyze_node_dependences (slp_instance instance
, slp_tree node
)
567 /* This walks over all stmts involved in the SLP load/store done
568 in NODE verifying we can sink them up to the last stmt in the
570 gimple
*last_access
= vect_find_last_scalar_stmt_in_slp (node
);
571 for (unsigned k
= 0; k
< SLP_INSTANCE_GROUP_SIZE (instance
); ++k
)
573 gimple
*access
= SLP_TREE_SCALAR_STMTS (node
)[k
];
574 if (access
== last_access
)
576 stmt_vec_info access_stmt_info
= vinfo_for_stmt (access
);
577 gimple_stmt_iterator gsi
= gsi_for_stmt (access
);
579 for (; gsi_stmt (gsi
) != last_access
; gsi_next (&gsi
))
581 gimple
*stmt
= gsi_stmt (gsi
);
582 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
583 if (!STMT_VINFO_DATA_REF (stmt_info
)
584 || (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
))
585 && DR_IS_READ (STMT_VINFO_DATA_REF (access_stmt_info
))))
588 ddr_p ddr
= initialize_data_dependence_relation
589 (STMT_VINFO_DATA_REF (access_stmt_info
),
590 STMT_VINFO_DATA_REF (stmt_info
), vNULL
);
591 if (vect_slp_analyze_data_ref_dependence (ddr
))
593 /* ??? If the dependence analysis failed we can resort to the
594 alias oracle which can handle more kinds of stmts. */
595 free_dependence_relation (ddr
);
598 free_dependence_relation (ddr
);
605 /* Function vect_analyze_data_ref_dependences.
607 Examine all the data references in the basic-block, and make sure there
608 do not exist any data dependences between them. Set *MAX_VF according to
609 the maximum vectorization factor the data dependences allow. */
612 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo
)
614 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE
, vect_location
,
616 "=== vect_slp_analyze_data_ref_dependences ===\n");
618 slp_instance instance
;
621 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
624 /* Verify we can sink loads to the vectorized stmt insert location. */
625 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), j
, load
)
626 if (! vect_slp_analyze_node_dependences (instance
, load
))
631 /* Verify we can sink stores to the vectorized stmt insert location. */
632 slp_tree store
= SLP_INSTANCE_TREE (instance
);
634 && STMT_VINFO_DATA_REF
635 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store
)[0]))
636 && ! vect_slp_analyze_node_dependences (instance
, store
))
640 dump_printf_loc (MSG_NOTE
, vect_location
,
641 "removing SLP instance operations starting from: ");
642 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
,
643 SLP_TREE_SCALAR_STMTS
644 (SLP_INSTANCE_TREE (instance
))[0], 0);
645 vect_free_slp_instance (instance
);
646 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
651 if (!BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
658 /* Function vect_compute_data_ref_alignment
660 Compute the misalignment of the data reference DR.
663 1. If during the misalignment computation it is found that the data reference
664 cannot be vectorized then false is returned.
665 2. DR_MISALIGNMENT (DR) is defined.
667 FOR NOW: No analysis is actually performed. Misalignment is calculated
668 only for trivial cases. TODO. */
671 vect_compute_data_ref_alignment (struct data_reference
*dr
)
673 gimple
*stmt
= DR_STMT (dr
);
674 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
675 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
676 struct loop
*loop
= NULL
;
677 tree ref
= DR_REF (dr
);
679 tree base
, base_addr
;
680 tree misalign
= NULL_TREE
;
682 unsigned HOST_WIDE_INT alignment
;
684 if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE
, vect_location
,
686 "vect_compute_data_ref_alignment:\n");
689 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
691 /* Initialize misalignment to unknown. */
692 SET_DR_MISALIGNMENT (dr
, -1);
694 if (tree_fits_shwi_p (DR_STEP (dr
)))
695 misalign
= DR_INIT (dr
);
696 aligned_to
= DR_ALIGNED_TO (dr
);
697 base_addr
= DR_BASE_ADDRESS (dr
);
698 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
700 /* In case the dataref is in an inner-loop of the loop that is being
701 vectorized (LOOP), we use the base and misalignment information
702 relative to the outer-loop (LOOP). This is ok only if the misalignment
703 stays the same throughout the execution of the inner-loop, which is why
704 we have to check that the stride of the dataref in the inner-loop evenly
705 divides by the vector size. */
706 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
708 tree step
= DR_STEP (dr
);
710 if (tree_fits_shwi_p (step
)
711 && tree_to_shwi (step
) % GET_MODE_SIZE (TYPE_MODE (vectype
)) == 0)
713 if (dump_enabled_p ())
714 dump_printf_loc (MSG_NOTE
, vect_location
,
715 "inner step divides the vector-size.\n");
716 misalign
= STMT_VINFO_DR_INIT (stmt_info
);
717 aligned_to
= STMT_VINFO_DR_ALIGNED_TO (stmt_info
);
718 base_addr
= STMT_VINFO_DR_BASE_ADDRESS (stmt_info
);
722 if (dump_enabled_p ())
723 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
724 "inner step doesn't divide the vector-size.\n");
725 misalign
= NULL_TREE
;
729 /* Similarly we can only use base and misalignment information relative to
730 an innermost loop if the misalignment stays the same throughout the
731 execution of the loop. As above, this is the case if the stride of
732 the dataref evenly divides by the vector size. */
735 tree step
= DR_STEP (dr
);
736 unsigned vf
= loop
? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) : 1;
738 if (tree_fits_shwi_p (step
)
739 && ((tree_to_shwi (step
) * vf
)
740 % GET_MODE_SIZE (TYPE_MODE (vectype
)) != 0))
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
744 "step doesn't divide the vector-size.\n");
745 misalign
= NULL_TREE
;
749 /* To look at alignment of the base we have to preserve an inner MEM_REF
750 as that carries alignment information of the actual access. */
752 while (handled_component_p (base
))
753 base
= TREE_OPERAND (base
, 0);
754 if (TREE_CODE (base
) == MEM_REF
)
755 base
= build2 (MEM_REF
, TREE_TYPE (base
), base_addr
,
756 build_int_cst (TREE_TYPE (TREE_OPERAND (base
, 1)), 0));
757 unsigned int base_alignment
= get_object_alignment (base
);
759 if (base_alignment
>= TYPE_ALIGN (TREE_TYPE (vectype
)))
760 DR_VECT_AUX (dr
)->base_element_aligned
= true;
762 alignment
= TYPE_ALIGN_UNIT (vectype
);
764 if ((compare_tree_int (aligned_to
, alignment
) < 0)
767 if (dump_enabled_p ())
769 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
770 "Unknown alignment for access: ");
771 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
772 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
777 if (base_alignment
< TYPE_ALIGN (vectype
))
779 /* Strip an inner MEM_REF to a bare decl if possible. */
780 if (TREE_CODE (base
) == MEM_REF
781 && integer_zerop (TREE_OPERAND (base
, 1))
782 && TREE_CODE (TREE_OPERAND (base
, 0)) == ADDR_EXPR
)
783 base
= TREE_OPERAND (TREE_OPERAND (base
, 0), 0);
785 if (!vect_can_force_dr_alignment_p (base
, TYPE_ALIGN (vectype
)))
787 if (dump_enabled_p ())
789 dump_printf_loc (MSG_NOTE
, vect_location
,
790 "can't force alignment of ref: ");
791 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
792 dump_printf (MSG_NOTE
, "\n");
797 /* Force the alignment of the decl.
798 NOTE: This is the only change to the code we make during
799 the analysis phase, before deciding to vectorize the loop. */
800 if (dump_enabled_p ())
802 dump_printf_loc (MSG_NOTE
, vect_location
, "force alignment of ");
803 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, ref
);
804 dump_printf (MSG_NOTE
, "\n");
807 DR_VECT_AUX (dr
)->base_decl
= base
;
808 DR_VECT_AUX (dr
)->base_misaligned
= true;
809 DR_VECT_AUX (dr
)->base_element_aligned
= true;
812 /* If this is a backward running DR then first access in the larger
813 vectype actually is N-1 elements before the address in the DR.
814 Adjust misalign accordingly. */
815 if (tree_int_cst_sgn (DR_STEP (dr
)) < 0)
817 tree offset
= ssize_int (TYPE_VECTOR_SUBPARTS (vectype
) - 1);
818 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
819 otherwise we wouldn't be here. */
820 offset
= fold_build2 (MULT_EXPR
, ssizetype
, offset
, DR_STEP (dr
));
821 /* PLUS because DR_STEP was negative. */
822 misalign
= size_binop (PLUS_EXPR
, misalign
, offset
);
825 SET_DR_MISALIGNMENT (dr
,
826 wi::mod_floor (misalign
, alignment
, SIGNED
).to_uhwi ());
828 if (dump_enabled_p ())
830 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
831 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr
));
832 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, ref
);
833 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
840 /* Function vect_compute_data_refs_alignment
842 Compute the misalignment of data references in the loop.
843 Return FALSE if a data reference is found that cannot be vectorized. */
846 vect_compute_data_refs_alignment (vec_info
*vinfo
)
848 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
849 struct data_reference
*dr
;
852 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
854 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
855 if (STMT_VINFO_VECTORIZABLE (stmt_info
)
856 && !vect_compute_data_ref_alignment (dr
))
858 /* Strided accesses perform only component accesses, misalignment
859 information is irrelevant for them. */
860 if (STMT_VINFO_STRIDED_P (stmt_info
)
861 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
864 if (is_a
<bb_vec_info
> (vinfo
))
866 /* Mark unsupported statement as unvectorizable. */
867 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
879 /* Function vect_update_misalignment_for_peel
881 DR - the data reference whose misalignment is to be adjusted.
882 DR_PEEL - the data reference whose misalignment is being made
883 zero in the vector loop by the peel.
884 NPEEL - the number of iterations in the peel loop if the misalignment
885 of DR_PEEL is known at compile time. */
888 vect_update_misalignment_for_peel (struct data_reference
*dr
,
889 struct data_reference
*dr_peel
, int npeel
)
892 vec
<dr_p
> same_align_drs
;
893 struct data_reference
*current_dr
;
894 int dr_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr
))));
895 int dr_peel_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel
))));
896 stmt_vec_info stmt_info
= vinfo_for_stmt (DR_STMT (dr
));
897 stmt_vec_info peel_stmt_info
= vinfo_for_stmt (DR_STMT (dr_peel
));
899 /* For interleaved data accesses the step in the loop must be multiplied by
900 the size of the interleaving group. */
901 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
902 dr_size
*= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info
)));
903 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info
))
904 dr_peel_size
*= GROUP_SIZE (peel_stmt_info
);
906 /* It can be assumed that the data refs with the same alignment as dr_peel
907 are aligned in the vector loop. */
909 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel
)));
910 FOR_EACH_VEC_ELT (same_align_drs
, i
, current_dr
)
912 if (current_dr
!= dr
)
914 gcc_assert (DR_MISALIGNMENT (dr
) / dr_size
==
915 DR_MISALIGNMENT (dr_peel
) / dr_peel_size
);
916 SET_DR_MISALIGNMENT (dr
, 0);
920 if (known_alignment_for_access_p (dr
)
921 && known_alignment_for_access_p (dr_peel
))
923 bool negative
= tree_int_cst_compare (DR_STEP (dr
), size_zero_node
) < 0;
924 int misal
= DR_MISALIGNMENT (dr
);
925 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
926 misal
+= negative
? -npeel
* dr_size
: npeel
* dr_size
;
927 misal
&= (TYPE_ALIGN (vectype
) / BITS_PER_UNIT
) - 1;
928 SET_DR_MISALIGNMENT (dr
, misal
);
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment to -1.\n");
934 SET_DR_MISALIGNMENT (dr
, -1);
938 /* Function vect_verify_datarefs_alignment
940 Return TRUE if all data references in the loop can be
941 handled with respect to alignment. */
944 vect_verify_datarefs_alignment (vec_info
*vinfo
)
946 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
947 struct data_reference
*dr
;
948 enum dr_alignment_support supportable_dr_alignment
;
951 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
953 gimple
*stmt
= DR_STMT (dr
);
954 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
956 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
959 /* For interleaving, only the alignment of the first access matters.
960 Skip statements marked as not vectorizable. */
961 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
962 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
963 || !STMT_VINFO_VECTORIZABLE (stmt_info
))
966 /* Strided accesses perform only component accesses, alignment is
967 irrelevant for them. */
968 if (STMT_VINFO_STRIDED_P (stmt_info
)
969 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
972 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
973 if (!supportable_dr_alignment
)
975 if (dump_enabled_p ())
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
979 "not vectorized: unsupported unaligned load.");
981 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
982 "not vectorized: unsupported unaligned "
985 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
,
987 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
991 if (supportable_dr_alignment
!= dr_aligned
&& dump_enabled_p ())
992 dump_printf_loc (MSG_NOTE
, vect_location
,
993 "Vectorizing an unaligned access.\n");
998 /* Given an memory reference EXP return whether its alignment is less
1002 not_size_aligned (tree exp
)
1004 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1007 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1008 > get_object_alignment (exp
));
1011 /* Function vector_alignment_reachable_p
1013 Return true if vector alignment for DR is reachable by peeling
1014 a few loop iterations. Return false otherwise. */
1017 vector_alignment_reachable_p (struct data_reference
*dr
)
1019 gimple
*stmt
= DR_STMT (dr
);
1020 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1021 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1023 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1025 /* For interleaved access we peel only if number of iterations in
1026 the prolog loop ({VF - misalignment}), is a multiple of the
1027 number of the interleaved accesses. */
1028 int elem_size
, mis_in_elements
;
1029 int nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1031 /* FORNOW: handle only known alignment. */
1032 if (!known_alignment_for_access_p (dr
))
1035 elem_size
= GET_MODE_SIZE (TYPE_MODE (vectype
)) / nelements
;
1036 mis_in_elements
= DR_MISALIGNMENT (dr
) / elem_size
;
1038 if ((nelements
- mis_in_elements
) % GROUP_SIZE (stmt_info
))
1042 /* If misalignment is known at the compile time then allow peeling
1043 only if natural alignment is reachable through peeling. */
1044 if (known_alignment_for_access_p (dr
) && !aligned_access_p (dr
))
1046 HOST_WIDE_INT elmsize
=
1047 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1048 if (dump_enabled_p ())
1050 dump_printf_loc (MSG_NOTE
, vect_location
,
1051 "data size =" HOST_WIDE_INT_PRINT_DEC
, elmsize
);
1052 dump_printf (MSG_NOTE
,
1053 ". misalignment = %d.\n", DR_MISALIGNMENT (dr
));
1055 if (DR_MISALIGNMENT (dr
) % elmsize
)
1057 if (dump_enabled_p ())
1058 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1059 "data size does not divide the misalignment.\n");
1064 if (!known_alignment_for_access_p (dr
))
1066 tree type
= TREE_TYPE (DR_REF (dr
));
1067 bool is_packed
= not_size_aligned (DR_REF (dr
));
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1070 "Unknown misalignment, is_packed = %d\n",is_packed
);
1071 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
1072 || targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
))
1082 /* Calculate the cost of the memory access represented by DR. */
1085 vect_get_data_access_cost (struct data_reference
*dr
,
1086 unsigned int *inside_cost
,
1087 unsigned int *outside_cost
,
1088 stmt_vector_for_cost
*body_cost_vec
)
1090 gimple
*stmt
= DR_STMT (dr
);
1091 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1092 int nunits
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
1093 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1094 int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1095 int ncopies
= vf
/ nunits
;
1097 if (DR_IS_READ (dr
))
1098 vect_get_load_cost (dr
, ncopies
, true, inside_cost
, outside_cost
,
1099 NULL
, body_cost_vec
, false);
1101 vect_get_store_cost (dr
, ncopies
, inside_cost
, body_cost_vec
);
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_NOTE
, vect_location
,
1105 "vect_get_data_access_cost: inside_cost = %d, "
1106 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
/* One candidate peeling amount: NPEEL iterations align DR, and COUNT
   data references in the loop become aligned under this choice.  */
typedef struct _vect_peel_info
{
  int npeel;			/* Candidate number of peeled iterations.  */
  struct data_reference *dr;	/* Representative data reference.  */
  unsigned int count;		/* Number of accesses this choice aligns.  */
} *vect_peel_info;
1117 typedef struct _vect_peel_extended_info
1119 struct _vect_peel_info peel_info
;
1120 unsigned int inside_cost
;
1121 unsigned int outside_cost
;
1122 stmt_vector_for_cost body_cost_vec
;
1123 } *vect_peel_extended_info
;
1126 /* Peeling hashtable helpers. */
1128 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1130 static inline hashval_t
hash (const _vect_peel_info
*);
1131 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1135 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1137 return (hashval_t
) peel_info
->npeel
;
1141 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1143 return (a
->npeel
== b
->npeel
);
1147 /* Insert DR into peeling hash table with NPEEL as key. */
1150 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1151 loop_vec_info loop_vinfo
, struct data_reference
*dr
,
1154 struct _vect_peel_info elem
, *slot
;
1155 _vect_peel_info
**new_slot
;
1156 bool supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1159 slot
= peeling_htab
->find (&elem
);
1164 slot
= XNEW (struct _vect_peel_info
);
1165 slot
->npeel
= npeel
;
1168 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1172 if (!supportable_dr_alignment
1173 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1174 slot
->count
+= VECT_MAX_COST
;
1178 /* Traverse peeling hash table to find peeling option that aligns maximum
1179 number of data accesses. */
1182 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1183 _vect_peel_extended_info
*max
)
1185 vect_peel_info elem
= *slot
;
1187 if (elem
->count
> max
->peel_info
.count
1188 || (elem
->count
== max
->peel_info
.count
1189 && max
->peel_info
.npeel
> elem
->npeel
))
1191 max
->peel_info
.npeel
= elem
->npeel
;
1192 max
->peel_info
.count
= elem
->count
;
1193 max
->peel_info
.dr
= elem
->dr
;
1200 /* Traverse peeling hash table and calculate cost for each peeling option.
1201 Find the one with the lowest cost. */
1204 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1205 _vect_peel_extended_info
*min
)
1207 vect_peel_info elem
= *slot
;
1208 int save_misalignment
, dummy
;
1209 unsigned int inside_cost
= 0, outside_cost
= 0, i
;
1210 gimple
*stmt
= DR_STMT (elem
->dr
);
1211 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1212 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
1213 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1214 struct data_reference
*dr
;
1215 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
, epilogue_cost_vec
;
1217 prologue_cost_vec
.create (2);
1218 body_cost_vec
.create (2);
1219 epilogue_cost_vec
.create (2);
1221 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1223 stmt
= DR_STMT (dr
);
1224 stmt_info
= vinfo_for_stmt (stmt
);
1225 /* For interleaving, only the alignment of the first access
1227 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1228 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1231 save_misalignment
= DR_MISALIGNMENT (dr
);
1232 vect_update_misalignment_for_peel (dr
, elem
->dr
, elem
->npeel
);
1233 vect_get_data_access_cost (dr
, &inside_cost
, &outside_cost
,
1235 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1238 outside_cost
+= vect_get_known_peeling_cost
1239 (loop_vinfo
, elem
->npeel
, &dummy
,
1240 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1241 &prologue_cost_vec
, &epilogue_cost_vec
);
1243 /* Prologue and epilogue costs are added to the target model later.
1244 These costs depend only on the scalar iteration cost, the
1245 number of peeling iterations finally chosen, and the number of
1246 misaligned statements. So discard the information found here. */
1247 prologue_cost_vec
.release ();
1248 epilogue_cost_vec
.release ();
1250 if (inside_cost
< min
->inside_cost
1251 || (inside_cost
== min
->inside_cost
&& outside_cost
< min
->outside_cost
))
1253 min
->inside_cost
= inside_cost
;
1254 min
->outside_cost
= outside_cost
;
1255 min
->body_cost_vec
.release ();
1256 min
->body_cost_vec
= body_cost_vec
;
1257 min
->peel_info
.dr
= elem
->dr
;
1258 min
->peel_info
.npeel
= elem
->npeel
;
1261 body_cost_vec
.release ();
1267 /* Choose best peeling option by traversing peeling hash table and either
1268 choosing an option with the lowest cost (if cost model is enabled) or the
1269 option that aligns as many accesses as possible. */
1271 static struct data_reference
*
1272 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1273 loop_vec_info loop_vinfo
,
1274 unsigned int *npeel
,
1275 stmt_vector_for_cost
*body_cost_vec
)
1277 struct _vect_peel_extended_info res
;
1279 res
.peel_info
.dr
= NULL
;
1280 res
.body_cost_vec
= stmt_vector_for_cost ();
1282 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1284 res
.inside_cost
= INT_MAX
;
1285 res
.outside_cost
= INT_MAX
;
1286 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1287 vect_peeling_hash_get_lowest_cost
> (&res
);
1291 res
.peel_info
.count
= 0;
1292 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1293 vect_peeling_hash_get_most_frequent
> (&res
);
1296 *npeel
= res
.peel_info
.npeel
;
1297 *body_cost_vec
= res
.body_cost_vec
;
1298 return res
.peel_info
.dr
;
1302 /* Function vect_enhance_data_refs_alignment
1304 This pass will use loop versioning and loop peeling in order to enhance
1305 the alignment of data references in the loop.
1307 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1308 original loop is to be vectorized. Any other loops that are created by
1309 the transformations performed in this pass - are not supposed to be
1310 vectorized. This restriction will be relaxed.
1312 This pass will require a cost model to guide it whether to apply peeling
1313 or versioning or a combination of the two. For example, the scheme that
1314 intel uses when given a loop with several memory accesses, is as follows:
1315 choose one memory access ('p') which alignment you want to force by doing
1316 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1317 other accesses are not necessarily aligned, or (2) use loop versioning to
1318 generate one loop in which all accesses are aligned, and another loop in
1319 which only 'p' is necessarily aligned.
1321 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1322 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1323 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1325 Devising a cost model is the most critical aspect of this work. It will
1326 guide us on which access to peel for, whether to use loop versioning, how
1327 many versions to create, etc. The cost model will probably consist of
1328 generic considerations as well as target specific considerations (on
1329 powerpc for example, misaligned stores are more painful than misaligned
1332 Here are the general steps involved in alignment enhancements:
1334 -- original loop, before alignment analysis:
1335 for (i=0; i<N; i++){
1336 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1337 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1340 -- After vect_compute_data_refs_alignment:
1341 for (i=0; i<N; i++){
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1346 -- Possibility 1: we do loop versioning:
1348 for (i=0; i<N; i++){ # loop 1A
1349 x = q[i]; # DR_MISALIGNMENT(q) = 3
1350 p[i] = y; # DR_MISALIGNMENT(p) = 0
1354 for (i=0; i<N; i++){ # loop 1B
1355 x = q[i]; # DR_MISALIGNMENT(q) = 3
1356 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1360 -- Possibility 2: we do loop peeling:
1361 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1365 for (i = 3; i < N; i++){ # loop 2A
1366 x = q[i]; # DR_MISALIGNMENT(q) = 0
1367 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1370 -- Possibility 3: combination of loop peeling and versioning:
1371 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1376 for (i = 3; i<N; i++){ # loop 3A
1377 x = q[i]; # DR_MISALIGNMENT(q) = 0
1378 p[i] = y; # DR_MISALIGNMENT(p) = 0
1382 for (i = 3; i<N; i++){ # loop 3B
1383 x = q[i]; # DR_MISALIGNMENT(q) = 0
1384 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1388 These loops are later passed to loop_transform to be vectorized. The
1389 vectorizer will use the alignment information to guide the transformation
1390 (whether to generate regular loads/stores, or with special handling for
1394 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1396 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1397 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1398 enum dr_alignment_support supportable_dr_alignment
;
1399 struct data_reference
*dr0
= NULL
, *first_store
= NULL
;
1400 struct data_reference
*dr
;
1402 bool do_peeling
= false;
1403 bool do_versioning
= false;
1406 stmt_vec_info stmt_info
;
1407 unsigned int npeel
= 0;
1408 bool all_misalignments_unknown
= true;
1409 unsigned int vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1410 unsigned possible_npeel_number
= 1;
1412 unsigned int nelements
, mis
, same_align_drs_max
= 0;
1413 stmt_vector_for_cost body_cost_vec
= stmt_vector_for_cost ();
1414 hash_table
<peel_info_hasher
> peeling_htab (1);
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE
, vect_location
,
1418 "=== vect_enhance_data_refs_alignment ===\n");
1420 /* Reset data so we can safely be called multiple times. */
1421 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1422 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1424 /* While cost model enhancements are expected in the future, the high level
1425 view of the code at this time is as follows:
1427 A) If there is a misaligned access then see if peeling to align
1428 this access can make all data references satisfy
1429 vect_supportable_dr_alignment. If so, update data structures
1430 as needed and return true.
1432 B) If peeling wasn't possible and there is a data reference with an
1433 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1434 then see if loop versioning checks can be used to make all data
1435 references satisfy vect_supportable_dr_alignment. If so, update
1436 data structures as needed and return true.
1438 C) If neither peeling nor versioning were successful then return false if
1439 any data reference does not satisfy vect_supportable_dr_alignment.
1441 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1443 Note, Possibility 3 above (which is peeling and versioning together) is not
1444 being done at this time. */
1446 /* (1) Peeling to force alignment. */
1448 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1450 + How many accesses will become aligned due to the peeling
1451 - How many accesses will become unaligned due to the peeling,
1452 and the cost of misaligned accesses.
1453 - The cost of peeling (the extra runtime checks, the increase
1456 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1458 stmt
= DR_STMT (dr
);
1459 stmt_info
= vinfo_for_stmt (stmt
);
1461 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1464 /* For interleaving, only the alignment of the first access
1466 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1467 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1470 /* For invariant accesses there is nothing to enhance. */
1471 if (integer_zerop (DR_STEP (dr
)))
1474 /* Strided accesses perform only component accesses, alignment is
1475 irrelevant for them. */
1476 if (STMT_VINFO_STRIDED_P (stmt_info
)
1477 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1480 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, true);
1481 do_peeling
= vector_alignment_reachable_p (dr
);
1484 if (known_alignment_for_access_p (dr
))
1486 unsigned int npeel_tmp
;
1487 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1488 size_zero_node
) < 0;
1490 /* Save info about DR in the hash table. */
1491 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1492 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1493 mis
= DR_MISALIGNMENT (dr
) / GET_MODE_SIZE (TYPE_MODE (
1494 TREE_TYPE (DR_REF (dr
))));
1495 npeel_tmp
= (negative
1496 ? (mis
- nelements
) : (nelements
- mis
))
1499 /* For multiple types, it is possible that the bigger type access
1500 will have more than one peeling option. E.g., a loop with two
1501 types: one of size (vector size / 4), and the other one of
1502 size (vector size / 8). Vectorization factor will 8. If both
1503 access are misaligned by 3, the first one needs one scalar
1504 iteration to be aligned, and the second one needs 5. But the
1505 the first one will be aligned also by peeling 5 scalar
1506 iterations, and in that case both accesses will be aligned.
1507 Hence, except for the immediate peeling amount, we also want
1508 to try to add full vector size, while we don't exceed
1509 vectorization factor.
1510 We do this automtically for cost model, since we calculate cost
1511 for every peeling option. */
1512 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1514 if (STMT_SLP_TYPE (stmt_info
))
1515 possible_npeel_number
1516 = (vf
* GROUP_SIZE (stmt_info
)) / nelements
;
1518 possible_npeel_number
= vf
/ nelements
;
1521 /* Handle the aligned case. We may decide to align some other
1522 access, making DR unaligned. */
1523 if (DR_MISALIGNMENT (dr
) == 0)
1526 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1527 possible_npeel_number
++;
1530 for (j
= 0; j
< possible_npeel_number
; j
++)
1532 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1534 npeel_tmp
+= nelements
;
1537 all_misalignments_unknown
= false;
1538 /* Data-ref that was chosen for the case that all the
1539 misalignments are unknown is not relevant anymore, since we
1540 have a data-ref with known alignment. */
1545 /* If we don't know any misalignment values, we prefer
1546 peeling for data-ref that has the maximum number of data-refs
1547 with the same alignment, unless the target prefers to align
1548 stores over load. */
1549 if (all_misalignments_unknown
)
1551 unsigned same_align_drs
1552 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info
).length ();
1554 || same_align_drs_max
< same_align_drs
)
1556 same_align_drs_max
= same_align_drs
;
1559 /* For data-refs with the same number of related
1560 accesses prefer the one where the misalign
1561 computation will be invariant in the outermost loop. */
1562 else if (same_align_drs_max
== same_align_drs
)
1564 struct loop
*ivloop0
, *ivloop
;
1565 ivloop0
= outermost_invariant_loop_for_expr
1566 (loop
, DR_BASE_ADDRESS (dr0
));
1567 ivloop
= outermost_invariant_loop_for_expr
1568 (loop
, DR_BASE_ADDRESS (dr
));
1569 if ((ivloop
&& !ivloop0
)
1570 || (ivloop
&& ivloop0
1571 && flow_loop_nested_p (ivloop
, ivloop0
)))
1575 if (!first_store
&& DR_IS_WRITE (dr
))
1579 /* If there are both known and unknown misaligned accesses in the
1580 loop, we choose peeling amount according to the known
1582 if (!supportable_dr_alignment
)
1585 if (!first_store
&& DR_IS_WRITE (dr
))
1592 if (!aligned_access_p (dr
))
1594 if (dump_enabled_p ())
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1596 "vector alignment may not be reachable\n");
1602 /* Check if we can possibly peel the loop. */
1603 if (!vect_can_advance_ivs_p (loop_vinfo
)
1604 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1609 && all_misalignments_unknown
1610 && vect_supportable_dr_alignment (dr0
, false))
1612 /* Check if the target requires to prefer stores over loads, i.e., if
1613 misaligned stores are more expensive than misaligned loads (taking
1614 drs with same alignment into account). */
1615 if (first_store
&& DR_IS_READ (dr0
))
1617 unsigned int load_inside_cost
= 0, load_outside_cost
= 0;
1618 unsigned int store_inside_cost
= 0, store_outside_cost
= 0;
1619 unsigned int load_inside_penalty
= 0, load_outside_penalty
= 0;
1620 unsigned int store_inside_penalty
= 0, store_outside_penalty
= 0;
1621 stmt_vector_for_cost dummy
;
1624 vect_get_data_access_cost (dr0
, &load_inside_cost
, &load_outside_cost
,
1626 vect_get_data_access_cost (first_store
, &store_inside_cost
,
1627 &store_outside_cost
, &dummy
);
1631 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1632 aligning the load DR0). */
1633 load_inside_penalty
= store_inside_cost
;
1634 load_outside_penalty
= store_outside_cost
;
1636 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1637 DR_STMT (first_store
))).iterate (i
, &dr
);
1639 if (DR_IS_READ (dr
))
1641 load_inside_penalty
+= load_inside_cost
;
1642 load_outside_penalty
+= load_outside_cost
;
1646 load_inside_penalty
+= store_inside_cost
;
1647 load_outside_penalty
+= store_outside_cost
;
1650 /* Calculate the penalty for leaving DR0 unaligned (by
1651 aligning the FIRST_STORE). */
1652 store_inside_penalty
= load_inside_cost
;
1653 store_outside_penalty
= load_outside_cost
;
1655 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1656 DR_STMT (dr0
))).iterate (i
, &dr
);
1658 if (DR_IS_READ (dr
))
1660 store_inside_penalty
+= load_inside_cost
;
1661 store_outside_penalty
+= load_outside_cost
;
1665 store_inside_penalty
+= store_inside_cost
;
1666 store_outside_penalty
+= store_outside_cost
;
1669 if (load_inside_penalty
> store_inside_penalty
1670 || (load_inside_penalty
== store_inside_penalty
1671 && load_outside_penalty
> store_outside_penalty
))
1675 /* In case there are only loads with different unknown misalignments, use
1676 peeling only if it may help to align other accesses in the loop or
1677 if it may help improving load bandwith when we'd end up using
1679 tree dr0_vt
= STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0
)));
1681 && !STMT_VINFO_SAME_ALIGN_REFS (
1682 vinfo_for_stmt (DR_STMT (dr0
))).length ()
1683 && (vect_supportable_dr_alignment (dr0
, false)
1684 != dr_unaligned_supported
1685 || (builtin_vectorization_cost (vector_load
, dr0_vt
, 0)
1686 == builtin_vectorization_cost (unaligned_load
, dr0_vt
, -1))))
1690 if (do_peeling
&& !dr0
)
1692 /* Peeling is possible, but there is no data access that is not supported
1693 unless aligned. So we try to choose the best possible peeling. */
1695 /* We should get here only if there are drs with known misalignment. */
1696 gcc_assert (!all_misalignments_unknown
);
1698 /* Choose the best peeling from the hash table. */
1699 dr0
= vect_peeling_hash_choose_best_peeling (&peeling_htab
,
1708 stmt
= DR_STMT (dr0
);
1709 stmt_info
= vinfo_for_stmt (stmt
);
1710 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1711 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1713 if (known_alignment_for_access_p (dr0
))
1715 bool negative
= tree_int_cst_compare (DR_STEP (dr0
),
1716 size_zero_node
) < 0;
1719 /* Since it's known at compile time, compute the number of
1720 iterations in the peeled loop (the peeling factor) for use in
1721 updating DR_MISALIGNMENT values. The peeling factor is the
1722 vectorization factor minus the misalignment as an element
1724 mis
= DR_MISALIGNMENT (dr0
);
1725 mis
/= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0
))));
1726 npeel
= ((negative
? mis
- nelements
: nelements
- mis
)
1730 /* For interleaved data access every iteration accesses all the
1731 members of the group, therefore we divide the number of iterations
1732 by the group size. */
1733 stmt_info
= vinfo_for_stmt (DR_STMT (dr0
));
1734 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1735 npeel
/= GROUP_SIZE (stmt_info
);
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE
, vect_location
,
1739 "Try peeling by %d\n", npeel
);
1742 /* Ensure that all data refs can be vectorized after the peel. */
1743 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1745 int save_misalignment
;
1750 stmt
= DR_STMT (dr
);
1751 stmt_info
= vinfo_for_stmt (stmt
);
1752 /* For interleaving, only the alignment of the first access
1754 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1755 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
)
1758 /* Strided accesses perform only component accesses, alignment is
1759 irrelevant for them. */
1760 if (STMT_VINFO_STRIDED_P (stmt_info
)
1761 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1764 save_misalignment
= DR_MISALIGNMENT (dr
);
1765 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1766 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1767 SET_DR_MISALIGNMENT (dr
, save_misalignment
);
1769 if (!supportable_dr_alignment
)
1776 if (do_peeling
&& known_alignment_for_access_p (dr0
) && npeel
== 0)
1778 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1783 body_cost_vec
.release ();
1788 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1791 unsigned max_allowed_peel
1792 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT
);
1793 if (max_allowed_peel
!= (unsigned)-1)
1795 unsigned max_peel
= npeel
;
1798 gimple
*dr_stmt
= DR_STMT (dr0
);
1799 stmt_vec_info vinfo
= vinfo_for_stmt (dr_stmt
);
1800 tree vtype
= STMT_VINFO_VECTYPE (vinfo
);
1801 max_peel
= TYPE_VECTOR_SUBPARTS (vtype
) - 1;
1803 if (max_peel
> max_allowed_peel
)
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE
, vect_location
,
1808 "Disable peeling, max peels reached: %d\n", max_peel
);
1813 /* Cost model #2 - if peeling may result in a remaining loop not
1814 iterating enough to be vectorized then do not peel. */
1816 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
1819 = npeel
== 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo
) - 1 : npeel
;
1820 if (LOOP_VINFO_INT_NITERS (loop_vinfo
)
1821 < LOOP_VINFO_VECT_FACTOR (loop_vinfo
) + max_peel
)
1827 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1828 If the misalignment of DR_i is identical to that of dr0 then set
1829 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1830 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1831 by the peeling factor times the element size of DR_i (MOD the
1832 vectorization factor times the size). Otherwise, the
1833 misalignment of DR_i must be set to unknown. */
1834 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1836 vect_update_misalignment_for_peel (dr
, dr0
, npeel
);
1838 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0
;
1840 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
1842 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1843 = DR_MISALIGNMENT (dr0
);
1844 SET_DR_MISALIGNMENT (dr0
, 0);
1845 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE
, vect_location
,
1848 "Alignment of access forced using peeling.\n");
1849 dump_printf_loc (MSG_NOTE
, vect_location
,
1850 "Peeling for alignment will be applied.\n");
1852 /* The inside-loop cost will be accounted for in vectorizable_load
1853 and vectorizable_store correctly with adjusted alignments.
1854 Drop the body_cst_vec on the floor here. */
1855 body_cost_vec
.release ();
1857 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1863 body_cost_vec
.release ();
1865 /* (2) Versioning to force alignment. */
1867 /* Try versioning if:
1868 1) optimize loop for speed
1869 2) there is at least one unsupported misaligned data ref with an unknown
1871 3) all misaligned data refs with a known misalignment are supported, and
1872 4) the number of runtime alignment checks is within reason. */
1875 optimize_loop_nest_for_speed_p (loop
)
1876 && (!loop
->inner
); /* FORNOW */
1880 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1882 stmt
= DR_STMT (dr
);
1883 stmt_info
= vinfo_for_stmt (stmt
);
1885 /* For interleaving, only the alignment of the first access
1887 if (aligned_access_p (dr
)
1888 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1889 && GROUP_FIRST_ELEMENT (stmt_info
) != stmt
))
1892 if (STMT_VINFO_STRIDED_P (stmt_info
))
1894 /* Strided loads perform only component accesses, alignment is
1895 irrelevant for them. */
1896 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1898 do_versioning
= false;
1902 supportable_dr_alignment
= vect_supportable_dr_alignment (dr
, false);
1904 if (!supportable_dr_alignment
)
1910 if (known_alignment_for_access_p (dr
)
1911 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
1912 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS
))
1914 do_versioning
= false;
1918 stmt
= DR_STMT (dr
);
1919 vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
1920 gcc_assert (vectype
);
1922 /* The rightmost bits of an aligned address must be zeros.
1923 Construct the mask needed for this test. For example,
1924 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1925 mask must be 15 = 0xf. */
1926 mask
= GET_MODE_SIZE (TYPE_MODE (vectype
)) - 1;
1928 /* FORNOW: use the same mask to test all potentially unaligned
1929 references in the loop. The vectorizer currently supports
1930 a single vector size, see the reference to
1931 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1932 vectorization factor is computed. */
1933 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo
)
1934 || LOOP_VINFO_PTR_MASK (loop_vinfo
) == mask
);
1935 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
1936 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (
1941 /* Versioning requires at least one misaligned data reference. */
1942 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
1943 do_versioning
= false;
1944 else if (!do_versioning
)
1945 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1950 vec
<gimple
*> may_misalign_stmts
1951 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
1954 /* It can now be assumed that the data references in the statements
1955 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1956 of the loop being vectorized. */
1957 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt
)
1959 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
1960 dr
= STMT_VINFO_DATA_REF (stmt_info
);
1961 SET_DR_MISALIGNMENT (dr
, 0);
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE
, vect_location
,
1964 "Alignment of access forced using versioning.\n");
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE
, vect_location
,
1969 "Versioning for alignment will be applied.\n");
1971 /* Peeling and versioning can't be done together at this time. */
1972 gcc_assert (! (do_peeling
&& do_versioning
));
1974 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1979 /* This point is reached if neither peeling nor versioning is being done. */
1980 gcc_assert (! (do_peeling
|| do_versioning
));
1982 stat
= vect_verify_datarefs_alignment (loop_vinfo
);
1987 /* Function vect_find_same_alignment_drs.
1989 Update group and alignment relations according to the chosen
1990 vectorization factor. */
1993 vect_find_same_alignment_drs (struct data_dependence_relation
*ddr
,
1994 loop_vec_info loop_vinfo
)
1997 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1998 int vectorization_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1999 struct data_reference
*dra
= DDR_A (ddr
);
2000 struct data_reference
*drb
= DDR_B (ddr
);
2001 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2002 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2003 int dra_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra
))));
2004 int drb_size
= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb
))));
2005 lambda_vector dist_v
;
2006 unsigned int loop_depth
;
2008 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
2014 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
2017 /* Loop-based vectorization and known data dependence. */
2018 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
2021 /* Data-dependence analysis reports a distance vector of zero
2022 for data-references that overlap only in the first iteration
2023 but have different sign step (see PR45764).
2024 So as a sanity check require equal DR_STEP. */
2025 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2028 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
2029 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
2031 int dist
= dist_v
[loop_depth
];
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE
, vect_location
,
2035 "dependence distance = %d.\n", dist
);
2037 /* Same loop iteration. */
2039 || (dist
% vectorization_factor
== 0 && dra_size
== drb_size
))
2041 /* Two references with distance zero have the same alignment. */
2042 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a
).safe_push (drb
);
2043 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b
).safe_push (dra
);
2044 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_NOTE
, vect_location
,
2047 "accesses have the same alignment.\n");
2048 dump_printf (MSG_NOTE
,
2049 "dependence distance modulo vf == 0 between ");
2050 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2051 dump_printf (MSG_NOTE
, " and ");
2052 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2053 dump_printf (MSG_NOTE
, "\n");
2060 /* Function vect_analyze_data_refs_alignment
2062 Analyze the alignment of the data-references in the loop.
2063 Return FALSE if a data reference is found that cannot be vectorized. */
2066 vect_analyze_data_refs_alignment (vec_info
*vinfo
)
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE
, vect_location
,
2070 "=== vect_analyze_data_refs_alignment ===\n");
2072 /* Mark groups of data references with same alignment using
2073 data dependence information. */
2074 if (is_a
<loop_vec_info
> (vinfo
))
2076 vec
<ddr_p
> ddrs
= vinfo
->ddrs
;
2077 struct data_dependence_relation
*ddr
;
2080 FOR_EACH_VEC_ELT (ddrs
, i
, ddr
)
2081 vect_find_same_alignment_drs (ddr
, as_a
<loop_vec_info
> (vinfo
));
2084 if (!vect_compute_data_refs_alignment (vinfo
))
2086 if (dump_enabled_p ())
2087 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2088 "not vectorized: can't calculate alignment "
2097 /* Analyze groups of accesses: check that DR belongs to a group of
2098 accesses of legal size, step, etc. Detect gaps, single element
2099 interleaving, and other special cases. Set grouped access info.
2100 Collect groups of strided stores for further use in SLP analysis.
2101 Worker for vect_analyze_group_access. */
2104 vect_analyze_group_access_1 (struct data_reference
*dr
)
2106 tree step
= DR_STEP (dr
);
2107 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2108 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2109 gimple
*stmt
= DR_STMT (dr
);
2110 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2111 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2112 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
2113 HOST_WIDE_INT dr_step
= -1;
2114 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2115 bool slp_impossible
= false;
2116 struct loop
*loop
= NULL
;
2119 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2121 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2122 size of the interleaving group (including gaps). */
2123 if (tree_fits_shwi_p (step
))
2125 dr_step
= tree_to_shwi (step
);
2126 groupsize
= absu_hwi (dr_step
) / type_size
;
2131 /* Not consecutive access is possible only if it is a part of interleaving. */
2132 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)))
2134 /* Check if it this DR is a part of interleaving, and is a single
2135 element of the group that is accessed in the loop. */
2137 /* Gaps are supported only for loads. STEP must be a multiple of the type
2138 size. The size of the group must be a power of 2. */
2140 && (dr_step
% type_size
) == 0
2142 && exact_log2 (groupsize
) != -1)
2144 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = stmt
;
2145 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2146 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_NOTE
, vect_location
,
2149 "Detected single element interleaving ");
2150 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dr
));
2151 dump_printf (MSG_NOTE
, " step ");
2152 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, step
);
2153 dump_printf (MSG_NOTE
, "\n");
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE
, vect_location
,
2160 "Data access with gaps requires scalar "
2164 if (dump_enabled_p ())
2165 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2166 "Peeling for outer loop is not"
2171 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2177 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2180 "not consecutive access ");
2181 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
2186 /* Mark the statement as unvectorizable. */
2187 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2191 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2192 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2196 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) == stmt
)
2198 /* First stmt in the interleaving chain. Check the chain. */
2199 gimple
*next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt
));
2200 struct data_reference
*data_ref
= dr
;
2201 unsigned int count
= 1;
2202 tree prev_init
= DR_INIT (data_ref
);
2203 gimple
*prev
= stmt
;
2204 HOST_WIDE_INT diff
, gaps
= 0;
2208 /* Skip same data-refs. In case that two or more stmts share
2209 data-ref (supported only for loads), we vectorize only the first
2210 stmt, and the rest get their vectorized loads from the first
2212 if (!tree_int_cst_compare (DR_INIT (data_ref
),
2213 DR_INIT (STMT_VINFO_DATA_REF (
2214 vinfo_for_stmt (next
)))))
2216 if (DR_IS_WRITE (data_ref
))
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2220 "Two store stmts share the same dr.\n");
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2226 "Two or more load stmts share the same dr.\n");
2228 /* For load use the same data-ref load. */
2229 GROUP_SAME_DR_STMT (vinfo_for_stmt (next
)) = prev
;
2232 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2237 data_ref
= STMT_VINFO_DATA_REF (vinfo_for_stmt (next
));
2239 /* All group members have the same STEP by construction. */
2240 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2242 /* Check that the distance between two accesses is equal to the type
2243 size. Otherwise, we have gaps. */
2244 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2245 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2248 /* FORNOW: SLP of accesses with gaps is not supported. */
2249 slp_impossible
= true;
2250 if (DR_IS_WRITE (data_ref
))
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2254 "interleaved store with gaps\n");
2261 last_accessed_element
+= diff
;
2263 /* Store the gap from the previous member of the group. If there is no
2264 gap in the access, GROUP_GAP is always 1. */
2265 GROUP_GAP (vinfo_for_stmt (next
)) = diff
;
2267 prev_init
= DR_INIT (data_ref
);
2268 next
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next
));
2269 /* Count the number of data-refs in the chain. */
2274 groupsize
= count
+ gaps
;
2276 if (groupsize
> UINT_MAX
)
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2280 "group is too large\n");
2284 /* Check that the size of the interleaving is equal to count for stores,
2285 i.e., that there are no gaps. */
2286 if (groupsize
!= count
2287 && !DR_IS_READ (dr
))
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2291 "interleaved store with gaps\n");
2295 /* If there is a gap after the last load in the group it is the
2296 difference between the groupsize and the last accessed
2298 When there is no gap, this difference should be 0. */
2299 GROUP_GAP (vinfo_for_stmt (stmt
)) = groupsize
- last_accessed_element
;
2301 GROUP_SIZE (vinfo_for_stmt (stmt
)) = groupsize
;
2302 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE
, vect_location
,
2305 "Detected interleaving ");
2306 if (DR_IS_READ (dr
))
2307 dump_printf (MSG_NOTE
, "load ");
2309 dump_printf (MSG_NOTE
, "store ");
2310 dump_printf (MSG_NOTE
, "of size %u starting with ",
2311 (unsigned)groupsize
);
2312 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
2313 if (GROUP_GAP (vinfo_for_stmt (stmt
)) != 0)
2314 dump_printf_loc (MSG_NOTE
, vect_location
,
2315 "There is a gap of %u elements after the group\n",
2316 GROUP_GAP (vinfo_for_stmt (stmt
)));
2319 /* SLP: create an SLP data structure for every interleaving group of
2320 stores for further analysis in vect_analyse_slp. */
2321 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2324 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt
);
2326 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt
);
2329 /* If there is a gap in the end of the group or the group size cannot
2330 be made a multiple of the vector element count then we access excess
2331 elements in the last iteration and thus need to peel that off. */
2333 && (groupsize
- last_accessed_element
> 0
2334 || exact_log2 (groupsize
) == -1))
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2339 "Data access with gaps requires scalar "
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2345 "Peeling for outer loop is not supported\n");
2349 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) = true;
2356 /* Analyze groups of accesses: check that DR belongs to a group of
2357 accesses of legal size, step, etc. Detect gaps, single element
2358 interleaving, and other special cases. Set grouped access info.
2359 Collect groups of strided stores for further use in SLP analysis. */
2362 vect_analyze_group_access (struct data_reference
*dr
)
2364 if (!vect_analyze_group_access_1 (dr
))
2366 /* Dissolve the group if present. */
2368 gimple
*stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr
)));
2371 stmt_vec_info vinfo
= vinfo_for_stmt (stmt
);
2372 next
= GROUP_NEXT_ELEMENT (vinfo
);
2373 GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2374 GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2382 /* Analyze the access pattern of the data-reference DR.
2383 In case of non-consecutive accesses call vect_analyze_group_access() to
2384 analyze groups of accesses. */
2387 vect_analyze_data_ref_access (struct data_reference
*dr
)
2389 tree step
= DR_STEP (dr
);
2390 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2391 gimple
*stmt
= DR_STMT (dr
);
2392 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
2393 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
2394 struct loop
*loop
= NULL
;
2397 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2399 if (loop_vinfo
&& !step
)
2401 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2403 "bad data-ref access in loop\n");
2407 /* Allow loads with zero step in inner-loop vectorization. */
2408 if (loop_vinfo
&& integer_zerop (step
))
2410 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2411 if (!nested_in_vect_loop_p (loop
, stmt
))
2412 return DR_IS_READ (dr
);
2413 /* Allow references with zero step for outer loops marked
2414 with pragma omp simd only - it guarantees absence of
2415 loop-carried dependencies between inner loop iterations. */
2416 if (!loop
->force_vectorize
)
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE
, vect_location
,
2420 "zero step in inner loop of nest\n");
2425 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2427 /* Interleaved accesses are not yet supported within outer-loop
2428 vectorization for references in the inner-loop. */
2429 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2431 /* For the rest of the analysis we use the outer-loop step. */
2432 step
= STMT_VINFO_DR_STEP (stmt_info
);
2433 if (integer_zerop (step
))
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE
, vect_location
,
2437 "zero step in outer loop.\n");
2438 return DR_IS_READ (dr
);
2443 if (TREE_CODE (step
) == INTEGER_CST
)
2445 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2446 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2448 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2450 /* Mark that it is not interleaving. */
2451 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
)) = NULL
;
2456 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE
, vect_location
,
2460 "grouped access in outer loop.\n");
2465 /* Assume this is a DR handled by non-constant strided load case. */
2466 if (TREE_CODE (step
) != INTEGER_CST
)
2467 return (STMT_VINFO_STRIDED_P (stmt_info
)
2468 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2469 || vect_analyze_group_access (dr
)));
2471 /* Not consecutive access - check if it's a part of interleaving group. */
2472 return vect_analyze_group_access (dr
);
2477 /* A helper function used in the comparator function to sort data
2478 references. T1 and T2 are two data references to be compared.
2479 The function returns -1, 0, or 1. */
2482 compare_tree (tree t1
, tree t2
)
2485 enum tree_code code
;
2496 if (TREE_CODE (t1
) != TREE_CODE (t2
))
2497 return TREE_CODE (t1
) < TREE_CODE (t2
) ? -1 : 1;
2499 code
= TREE_CODE (t1
);
2502 /* For const values, we can just use hash values for comparisons. */
2510 hashval_t h1
= iterative_hash_expr (t1
, 0);
2511 hashval_t h2
= iterative_hash_expr (t2
, 0);
2513 return h1
< h2
? -1 : 1;
2518 cmp
= compare_tree (SSA_NAME_VAR (t1
), SSA_NAME_VAR (t2
));
2522 if (SSA_NAME_VERSION (t1
) != SSA_NAME_VERSION (t2
))
2523 return SSA_NAME_VERSION (t1
) < SSA_NAME_VERSION (t2
) ? -1 : 1;
2527 tclass
= TREE_CODE_CLASS (code
);
2529 /* For var-decl, we could compare their UIDs. */
2530 if (tclass
== tcc_declaration
)
2532 if (DECL_UID (t1
) != DECL_UID (t2
))
2533 return DECL_UID (t1
) < DECL_UID (t2
) ? -1 : 1;
2537 /* For expressions with operands, compare their operands recursively. */
2538 for (i
= TREE_OPERAND_LENGTH (t1
) - 1; i
>= 0; --i
)
2540 cmp
= compare_tree (TREE_OPERAND (t1
, i
), TREE_OPERAND (t2
, i
));
2550 /* Compare two data-references DRA and DRB to group them into chunks
2551 suitable for grouping. */
2554 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2556 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
2557 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
2560 /* Stabilize sort. */
2564 /* Ordering of DRs according to base. */
2565 if (!operand_equal_p (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
), 0))
2567 cmp
= compare_tree (DR_BASE_ADDRESS (dra
), DR_BASE_ADDRESS (drb
));
2572 /* And according to DR_OFFSET. */
2573 if (!dr_equal_offsets_p (dra
, drb
))
2575 cmp
= compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2580 /* Put reads before writes. */
2581 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2582 return DR_IS_READ (dra
) ? -1 : 1;
2584 /* Then sort after access size. */
2585 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2586 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))), 0))
2588 cmp
= compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2589 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2594 /* And after step. */
2595 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2597 cmp
= compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2602 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2603 cmp
= tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
));
2605 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2609 /* Function vect_analyze_data_ref_accesses.
2611 Analyze the access pattern of all the data references in the loop.
2613 FORNOW: the only access pattern that is considered vectorizable is a
2614 simple step 1 (consecutive) access.
2616 FORNOW: handle only arrays and pointer accesses. */
2619 vect_analyze_data_ref_accesses (vec_info
*vinfo
)
2622 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
2623 struct data_reference
*dr
;
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_NOTE
, vect_location
,
2627 "=== vect_analyze_data_ref_accesses ===\n");
2629 if (datarefs
.is_empty ())
2632 /* Sort the array of datarefs to make building the interleaving chains
2633 linear. Don't modify the original vector's order, it is needed for
2634 determining what dependencies are reversed. */
2635 vec
<data_reference_p
> datarefs_copy
= datarefs
.copy ();
2636 datarefs_copy
.qsort (dr_group_sort_cmp
);
2638 /* Build the interleaving chains. */
2639 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2641 data_reference_p dra
= datarefs_copy
[i
];
2642 stmt_vec_info stmtinfo_a
= vinfo_for_stmt (DR_STMT (dra
));
2643 stmt_vec_info lastinfo
= NULL
;
2644 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2646 data_reference_p drb
= datarefs_copy
[i
];
2647 stmt_vec_info stmtinfo_b
= vinfo_for_stmt (DR_STMT (drb
));
2649 /* ??? Imperfect sorting (non-compatible types, non-modulo
2650 accesses, same accesses) can lead to a group to be artificially
2651 split here as we don't just skip over those. If it really
2652 matters we can push those to a worklist and re-iterate
2653 over them. The we can just skip ahead to the next DR here. */
2655 /* Check that the data-refs have same first location (except init)
2656 and they are both either store or load (not load and store,
2657 not masked loads or stores). */
2658 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
2659 || !operand_equal_p (DR_BASE_ADDRESS (dra
),
2660 DR_BASE_ADDRESS (drb
), 0)
2661 || !dr_equal_offsets_p (dra
, drb
)
2662 || !gimple_assign_single_p (DR_STMT (dra
))
2663 || !gimple_assign_single_p (DR_STMT (drb
)))
2666 /* Check that the data-refs have the same constant size. */
2667 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
2668 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
2669 if (!tree_fits_uhwi_p (sza
)
2670 || !tree_fits_uhwi_p (szb
)
2671 || !tree_int_cst_equal (sza
, szb
))
2674 /* Check that the data-refs have the same step. */
2675 if (!operand_equal_p (DR_STEP (dra
), DR_STEP (drb
), 0))
2678 /* Do not place the same access in the interleaving chain twice. */
2679 if (tree_int_cst_compare (DR_INIT (dra
), DR_INIT (drb
)) == 0)
2682 /* Check the types are compatible.
2683 ??? We don't distinguish this during sorting. */
2684 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
2685 TREE_TYPE (DR_REF (drb
))))
2688 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2689 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
2690 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
2691 gcc_assert (init_a
< init_b
);
2693 /* If init_b == init_a + the size of the type * k, we have an
2694 interleaving, and DRA is accessed before DRB. */
2695 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
2696 if ((init_b
- init_a
) % type_size_a
!= 0)
2699 /* If we have a store, the accesses are adjacent. This splits
2700 groups into chunks we support (we don't support vectorization
2701 of stores with gaps). */
2702 if (!DR_IS_READ (dra
)
2703 && (init_b
- (HOST_WIDE_INT
) TREE_INT_CST_LOW
2704 (DR_INIT (datarefs_copy
[i
-1]))
2708 /* If the step (if not zero or non-constant) is greater than the
2709 difference between data-refs' inits this splits groups into
2711 if (tree_fits_shwi_p (DR_STEP (dra
)))
2713 HOST_WIDE_INT step
= tree_to_shwi (DR_STEP (dra
));
2714 if (step
!= 0 && step
<= (init_b
- init_a
))
2718 if (dump_enabled_p ())
2720 dump_printf_loc (MSG_NOTE
, vect_location
,
2721 "Detected interleaving ");
2722 if (DR_IS_READ (dra
))
2723 dump_printf (MSG_NOTE
, "load ");
2725 dump_printf (MSG_NOTE
, "store ");
2726 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (dra
));
2727 dump_printf (MSG_NOTE
, " and ");
2728 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_REF (drb
));
2729 dump_printf (MSG_NOTE
, "\n");
2732 /* Link the found element into the group list. */
2733 if (!GROUP_FIRST_ELEMENT (stmtinfo_a
))
2735 GROUP_FIRST_ELEMENT (stmtinfo_a
) = DR_STMT (dra
);
2736 lastinfo
= stmtinfo_a
;
2738 GROUP_FIRST_ELEMENT (stmtinfo_b
) = DR_STMT (dra
);
2739 GROUP_NEXT_ELEMENT (lastinfo
) = DR_STMT (drb
);
2740 lastinfo
= stmtinfo_b
;
2744 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr
)
2745 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
)))
2746 && !vect_analyze_data_ref_access (dr
))
2748 if (dump_enabled_p ())
2749 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2750 "not vectorized: complicated access pattern.\n");
2752 if (is_a
<bb_vec_info
> (vinfo
))
2754 /* Mark the statement as not vectorizable. */
2755 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
2760 datarefs_copy
.release ();
2765 datarefs_copy
.release ();
2770 /* Operator == between two dr_with_seg_len objects.
2772 This equality operator is used to make sure two data refs
2773 are the same one so that we will consider to combine the
2774 aliasing checks of those two pairs of data dependent data
2778 operator == (const dr_with_seg_len
& d1
,
2779 const dr_with_seg_len
& d2
)
2781 return operand_equal_p (DR_BASE_ADDRESS (d1
.dr
),
2782 DR_BASE_ADDRESS (d2
.dr
), 0)
2783 && compare_tree (d1
.offset
, d2
.offset
) == 0
2784 && compare_tree (d1
.seg_len
, d2
.seg_len
) == 0;
2787 /* Function comp_dr_with_seg_len_pair.
2789 Comparison function for sorting objects of dr_with_seg_len_pair_t
2790 so that we can combine aliasing checks in one scan. */
2793 comp_dr_with_seg_len_pair (const void *p1_
, const void *p2_
)
2795 const dr_with_seg_len_pair_t
* p1
= (const dr_with_seg_len_pair_t
*) p1_
;
2796 const dr_with_seg_len_pair_t
* p2
= (const dr_with_seg_len_pair_t
*) p2_
;
2798 const dr_with_seg_len
&p11
= p1
->first
,
2803 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2804 if a and c have the same basic address snd step, and b and d have the same
2805 address and step. Therefore, if any a&c or b&d don't have the same address
2806 and step, we don't care the order of those two pairs after sorting. */
2809 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p11
.dr
),
2810 DR_BASE_ADDRESS (p21
.dr
))) != 0)
2812 if ((comp_res
= compare_tree (DR_BASE_ADDRESS (p12
.dr
),
2813 DR_BASE_ADDRESS (p22
.dr
))) != 0)
2815 if ((comp_res
= compare_tree (DR_STEP (p11
.dr
), DR_STEP (p21
.dr
))) != 0)
2817 if ((comp_res
= compare_tree (DR_STEP (p12
.dr
), DR_STEP (p22
.dr
))) != 0)
2819 if ((comp_res
= compare_tree (p11
.offset
, p21
.offset
)) != 0)
2821 if ((comp_res
= compare_tree (p12
.offset
, p22
.offset
)) != 0)
2827 /* Function vect_vfa_segment_size.
2829 Create an expression that computes the size of segment
2830 that will be accessed for a data reference. The functions takes into
2831 account that realignment loads may access one more vector.
2834 DR: The data reference.
2835 LENGTH_FACTOR: segment length to consider.
2837 Return an expression whose value is the size of segment which will be
2841 vect_vfa_segment_size (struct data_reference
*dr
, tree length_factor
)
2843 tree segment_length
;
2845 if (integer_zerop (DR_STEP (dr
)))
2846 segment_length
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
2848 segment_length
= size_binop (MULT_EXPR
,
2849 fold_convert (sizetype
, DR_STEP (dr
)),
2850 fold_convert (sizetype
, length_factor
));
2852 if (vect_supportable_dr_alignment (dr
, false)
2853 == dr_explicit_realign_optimized
)
2855 tree vector_size
= TYPE_SIZE_UNIT
2856 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr
))));
2858 segment_length
= size_binop (PLUS_EXPR
, segment_length
, vector_size
);
2860 return segment_length
;
2863 /* Function vect_prune_runtime_alias_test_list.
2865 Prune a list of ddrs to be tested at run-time by versioning for alias.
2866 Merge several alias checks into one if possible.
2867 Return FALSE if resulting list of ddrs is longer then allowed by
2868 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2871 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
2873 vec
<ddr_p
> may_alias_ddrs
=
2874 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
2875 vec
<dr_with_seg_len_pair_t
>& comp_alias_ddrs
=
2876 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
2877 int vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
2878 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE
, vect_location
,
2886 "=== vect_prune_runtime_alias_test_list ===\n");
2888 if (may_alias_ddrs
.is_empty ())
2891 /* Basically, for each pair of dependent data refs store_ptr_0
2892 and load_ptr_0, we create an expression:
2894 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2895 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2897 for aliasing checks. However, in some cases we can decrease
2898 the number of checks by combining two checks into one. For
2899 example, suppose we have another pair of data refs store_ptr_0
2900 and load_ptr_1, and if the following condition is satisfied:
2902 load_ptr_0 < load_ptr_1 &&
2903 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2905 (this condition means, in each iteration of vectorized loop,
2906 the accessed memory of store_ptr_0 cannot be between the memory
2907 of load_ptr_0 and load_ptr_1.)
2909 we then can use only the following expression to finish the
2910 alising checks between store_ptr_0 & load_ptr_0 and
2911 store_ptr_0 & load_ptr_1:
2913 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2914 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2916 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2917 same basic address. */
2919 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
2921 /* First, we collect all data ref pairs for aliasing checks. */
2922 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
2924 struct data_reference
*dr_a
, *dr_b
;
2925 gimple
*dr_group_first_a
, *dr_group_first_b
;
2926 tree segment_length_a
, segment_length_b
;
2927 gimple
*stmt_a
, *stmt_b
;
2930 stmt_a
= DR_STMT (DDR_A (ddr
));
2931 dr_group_first_a
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a
));
2932 if (dr_group_first_a
)
2934 stmt_a
= dr_group_first_a
;
2935 dr_a
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a
));
2939 stmt_b
= DR_STMT (DDR_B (ddr
));
2940 dr_group_first_b
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b
));
2941 if (dr_group_first_b
)
2943 stmt_b
= dr_group_first_b
;
2944 dr_b
= STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b
));
2947 if (!operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0))
2948 length_factor
= scalar_loop_iters
;
2950 length_factor
= size_int (vect_factor
);
2951 segment_length_a
= vect_vfa_segment_size (dr_a
, length_factor
);
2952 segment_length_b
= vect_vfa_segment_size (dr_b
, length_factor
);
2954 dr_with_seg_len_pair_t dr_with_seg_len_pair
2955 (dr_with_seg_len (dr_a
, segment_length_a
),
2956 dr_with_seg_len (dr_b
, segment_length_b
));
2958 if (compare_tree (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
)) > 0)
2959 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2961 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
2964 /* Second, we sort the collected data ref pairs so that we can scan
2965 them once to combine all possible aliasing checks. */
2966 comp_alias_ddrs
.qsort (comp_dr_with_seg_len_pair
);
2968 /* Third, we scan the sorted dr pairs and check if we can combine
2969 alias checks of two neighbouring dr pairs. */
2970 for (size_t i
= 1; i
< comp_alias_ddrs
.length (); ++i
)
2972 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2973 dr_with_seg_len
*dr_a1
= &comp_alias_ddrs
[i
-1].first
,
2974 *dr_b1
= &comp_alias_ddrs
[i
-1].second
,
2975 *dr_a2
= &comp_alias_ddrs
[i
].first
,
2976 *dr_b2
= &comp_alias_ddrs
[i
].second
;
2978 /* Remove duplicate data ref pairs. */
2979 if (*dr_a1
== *dr_a2
&& *dr_b1
== *dr_b2
)
2981 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_NOTE
, vect_location
,
2984 "found equal ranges ");
2985 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2986 DR_REF (dr_a1
->dr
));
2987 dump_printf (MSG_NOTE
, ", ");
2988 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2989 DR_REF (dr_b1
->dr
));
2990 dump_printf (MSG_NOTE
, " and ");
2991 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2992 DR_REF (dr_a2
->dr
));
2993 dump_printf (MSG_NOTE
, ", ");
2994 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
2995 DR_REF (dr_b2
->dr
));
2996 dump_printf (MSG_NOTE
, "\n");
2999 comp_alias_ddrs
.ordered_remove (i
--);
3003 if (*dr_a1
== *dr_a2
|| *dr_b1
== *dr_b2
)
3005 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3006 and DR_A1 and DR_A2 are two consecutive memrefs. */
3007 if (*dr_a1
== *dr_a2
)
3009 std::swap (dr_a1
, dr_b1
);
3010 std::swap (dr_a2
, dr_b2
);
3013 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1
->dr
),
3014 DR_BASE_ADDRESS (dr_a2
->dr
),
3016 || !tree_fits_shwi_p (dr_a1
->offset
)
3017 || !tree_fits_shwi_p (dr_a2
->offset
))
3020 HOST_WIDE_INT diff
= (tree_to_shwi (dr_a2
->offset
)
3021 - tree_to_shwi (dr_a1
->offset
));
3024 /* Now we check if the following condition is satisfied:
3026 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3028 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3029 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3030 have to make a best estimation. We can get the minimum value
3031 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3032 then either of the following two conditions can guarantee the
3035 1: DIFF <= MIN_SEG_LEN_B
3036 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3040 HOST_WIDE_INT min_seg_len_b
= (tree_fits_shwi_p (dr_b1
->seg_len
)
3041 ? tree_to_shwi (dr_b1
->seg_len
)
3044 if (diff
<= min_seg_len_b
3045 || (tree_fits_shwi_p (dr_a1
->seg_len
)
3046 && diff
- tree_to_shwi (dr_a1
->seg_len
) < min_seg_len_b
))
3048 if (dump_enabled_p ())
3050 dump_printf_loc (MSG_NOTE
, vect_location
,
3051 "merging ranges for ");
3052 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3053 DR_REF (dr_a1
->dr
));
3054 dump_printf (MSG_NOTE
, ", ");
3055 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3056 DR_REF (dr_b1
->dr
));
3057 dump_printf (MSG_NOTE
, " and ");
3058 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3059 DR_REF (dr_a2
->dr
));
3060 dump_printf (MSG_NOTE
, ", ");
3061 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3062 DR_REF (dr_b2
->dr
));
3063 dump_printf (MSG_NOTE
, "\n");
3066 dr_a1
->seg_len
= size_binop (PLUS_EXPR
,
3067 dr_a2
->seg_len
, size_int (diff
));
3068 comp_alias_ddrs
.ordered_remove (i
--);
3073 dump_printf_loc (MSG_NOTE
, vect_location
,
3074 "improved number of alias checks from %d to %d\n",
3075 may_alias_ddrs
.length (), comp_alias_ddrs
.length ());
3076 if ((int) comp_alias_ddrs
.length () >
3077 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS
))
3083 /* Check whether a non-affine read or write in stmt is suitable for gather load
3084 or scatter store and if so, return a builtin decl for that operation. */
3087 vect_check_gather_scatter (gimple
*stmt
, loop_vec_info loop_vinfo
, tree
*basep
,
3088 tree
*offp
, int *scalep
)
3090 HOST_WIDE_INT scale
= 1, pbitpos
, pbitsize
;
3091 struct loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3092 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3093 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3094 tree offtype
= NULL_TREE
;
3095 tree decl
, base
, off
;
3097 int punsignedp
, reversep
, pvolatilep
= 0;
3100 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3101 see if we can use the def stmt of the address. */
3102 if (is_gimple_call (stmt
)
3103 && gimple_call_internal_p (stmt
)
3104 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
3105 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
)
3106 && TREE_CODE (base
) == MEM_REF
3107 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3108 && integer_zerop (TREE_OPERAND (base
, 1))
3109 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3111 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3112 if (is_gimple_assign (def_stmt
)
3113 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3114 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3117 /* The gather and scatter builtins need address of the form
3118 loop_invariant + vector * {1, 2, 4, 8}
3120 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3121 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3122 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3123 multiplications and additions in it. To get a vector, we need
3124 a single SSA_NAME that will be defined in the loop and will
3125 contain everything that is not loop invariant and that can be
3126 vectorized. The following code attempts to find such a preexistng
3127 SSA_NAME OFF and put the loop invariants into a tree BASE
3128 that can be gimplified before the loop. */
3129 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3130 &punsignedp
, &reversep
, &pvolatilep
, false);
3131 gcc_assert (base
&& (pbitpos
% BITS_PER_UNIT
) == 0 && !reversep
);
3133 if (TREE_CODE (base
) == MEM_REF
)
3135 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3137 if (off
== NULL_TREE
)
3139 offset_int moff
= mem_ref_offset (base
);
3140 off
= wide_int_to_tree (sizetype
, moff
);
3143 off
= size_binop (PLUS_EXPR
, off
,
3144 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3146 base
= TREE_OPERAND (base
, 0);
3149 base
= build_fold_addr_expr (base
);
3151 if (off
== NULL_TREE
)
3152 off
= size_zero_node
;
3154 /* If base is not loop invariant, either off is 0, then we start with just
3155 the constant offset in the loop invariant BASE and continue with base
3156 as OFF, otherwise give up.
3157 We could handle that case by gimplifying the addition of base + off
3158 into some SSA_NAME and use that as off, but for now punt. */
3159 if (!expr_invariant_in_loop_p (loop
, base
))
3161 if (!integer_zerop (off
))
3164 base
= size_int (pbitpos
/ BITS_PER_UNIT
);
3166 /* Otherwise put base + constant offset into the loop invariant BASE
3167 and continue with OFF. */
3170 base
= fold_convert (sizetype
, base
);
3171 base
= size_binop (PLUS_EXPR
, base
, size_int (pbitpos
/ BITS_PER_UNIT
));
3174 /* OFF at this point may be either a SSA_NAME or some tree expression
3175 from get_inner_reference. Try to peel off loop invariants from it
3176 into BASE as long as possible. */
3178 while (offtype
== NULL_TREE
)
3180 enum tree_code code
;
3181 tree op0
, op1
, add
= NULL_TREE
;
3183 if (TREE_CODE (off
) == SSA_NAME
)
3185 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3187 if (expr_invariant_in_loop_p (loop
, off
))
3190 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3193 op0
= gimple_assign_rhs1 (def_stmt
);
3194 code
= gimple_assign_rhs_code (def_stmt
);
3195 op1
= gimple_assign_rhs2 (def_stmt
);
3199 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3201 code
= TREE_CODE (off
);
3202 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3206 case POINTER_PLUS_EXPR
:
3208 if (expr_invariant_in_loop_p (loop
, op0
))
3213 add
= fold_convert (sizetype
, add
);
3215 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3216 base
= size_binop (PLUS_EXPR
, base
, add
);
3219 if (expr_invariant_in_loop_p (loop
, op1
))
3227 if (expr_invariant_in_loop_p (loop
, op1
))
3229 add
= fold_convert (sizetype
, op1
);
3230 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3236 if (scale
== 1 && tree_fits_shwi_p (op1
))
3238 scale
= tree_to_shwi (op1
);
3247 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3248 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3250 if (TYPE_PRECISION (TREE_TYPE (op0
))
3251 == TYPE_PRECISION (TREE_TYPE (off
)))
3256 if (TYPE_PRECISION (TREE_TYPE (op0
))
3257 < TYPE_PRECISION (TREE_TYPE (off
)))
3260 offtype
= TREE_TYPE (off
);
3271 /* If at the end OFF still isn't a SSA_NAME or isn't
3272 defined in the loop, punt. */
3273 if (TREE_CODE (off
) != SSA_NAME
3274 || expr_invariant_in_loop_p (loop
, off
))
3277 if (offtype
== NULL_TREE
)
3278 offtype
= TREE_TYPE (off
);
3280 if (DR_IS_READ (dr
))
3281 decl
= targetm
.vectorize
.builtin_gather (STMT_VINFO_VECTYPE (stmt_info
),
3284 decl
= targetm
.vectorize
.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info
),
3287 if (decl
== NULL_TREE
)
3299 /* Function vect_analyze_data_refs.
3301 Find all the data references in the loop or basic block.
3303 The general structure of the analysis of data refs in the vectorizer is as
3305 1- vect_analyze_data_refs(loop/bb): call
3306 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3307 in the loop/bb and their dependences.
3308 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3309 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3310 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3315 vect_analyze_data_refs (vec_info
*vinfo
, int *min_vf
)
3317 struct loop
*loop
= NULL
;
3319 struct data_reference
*dr
;
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE
, vect_location
,
3324 "=== vect_analyze_data_refs ===\n");
3326 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3327 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3329 /* Go through the data-refs, check that the analysis succeeded. Update
3330 pointer from stmt_vec_info struct to DR and vectype. */
3332 vec
<data_reference_p
> datarefs
= vinfo
->datarefs
;
3333 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
3336 stmt_vec_info stmt_info
;
3337 tree base
, offset
, init
;
3338 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
3339 bool simd_lane_access
= false;
3343 if (!dr
|| !DR_REF (dr
))
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3347 "not vectorized: unhandled data-ref\n");
3351 stmt
= DR_STMT (dr
);
3352 stmt_info
= vinfo_for_stmt (stmt
);
3354 /* Discard clobbers from the dataref vector. We will remove
3355 clobber stmts during vectorization. */
3356 if (gimple_clobber_p (stmt
))
3359 if (i
== datarefs
.length () - 1)
3364 datarefs
.ordered_remove (i
);
3369 /* Check that analysis of the data-ref succeeded. */
3370 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
3375 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3376 && targetm
.vectorize
.builtin_gather
!= NULL
;
3379 && !TREE_THIS_VOLATILE (DR_REF (dr
))
3380 && targetm
.vectorize
.builtin_scatter
!= NULL
;
3381 bool maybe_simd_lane_access
3382 = is_a
<loop_vec_info
> (vinfo
) && loop
->simduid
;
3384 /* If target supports vector gather loads or scatter stores, or if
3385 this might be a SIMD lane access, see if they can't be used. */
3386 if (is_a
<loop_vec_info
> (vinfo
)
3387 && (maybe_gather
|| maybe_scatter
|| maybe_simd_lane_access
)
3388 && !nested_in_vect_loop_p (loop
, stmt
))
3390 struct data_reference
*newdr
3391 = create_data_ref (NULL
, loop_containing_stmt (stmt
),
3392 DR_REF (dr
), stmt
, maybe_scatter
? false : true);
3393 gcc_assert (newdr
!= NULL
&& DR_REF (newdr
));
3394 if (DR_BASE_ADDRESS (newdr
)
3395 && DR_OFFSET (newdr
)
3398 && integer_zerop (DR_STEP (newdr
)))
3400 if (maybe_simd_lane_access
)
3402 tree off
= DR_OFFSET (newdr
);
3404 if (TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
3405 && TREE_CODE (off
) == MULT_EXPR
3406 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
3408 tree step
= TREE_OPERAND (off
, 1);
3409 off
= TREE_OPERAND (off
, 0);
3411 if (CONVERT_EXPR_P (off
)
3412 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
,
3414 < TYPE_PRECISION (TREE_TYPE (off
)))
3415 off
= TREE_OPERAND (off
, 0);
3416 if (TREE_CODE (off
) == SSA_NAME
)
3418 gimple
*def
= SSA_NAME_DEF_STMT (off
);
3419 tree reft
= TREE_TYPE (DR_REF (newdr
));
3420 if (is_gimple_call (def
)
3421 && gimple_call_internal_p (def
)
3422 && (gimple_call_internal_fn (def
)
3423 == IFN_GOMP_SIMD_LANE
))
3425 tree arg
= gimple_call_arg (def
, 0);
3426 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
3427 arg
= SSA_NAME_VAR (arg
);
3428 if (arg
== loop
->simduid
3430 && tree_int_cst_equal
3431 (TYPE_SIZE_UNIT (reft
),
3434 DR_OFFSET (newdr
) = ssize_int (0);
3435 DR_STEP (newdr
) = step
;
3436 DR_ALIGNED_TO (newdr
)
3437 = size_int (BIGGEST_ALIGNMENT
);
3439 simd_lane_access
= true;
3445 if (!simd_lane_access
&& (maybe_gather
|| maybe_scatter
))
3449 gatherscatter
= GATHER
;
3451 gatherscatter
= SCATTER
;
3454 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3455 free_data_ref (newdr
);
3458 if (gatherscatter
== SG_NONE
&& !simd_lane_access
)
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3463 "not vectorized: data ref analysis "
3465 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3466 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3469 if (is_a
<bb_vec_info
> (vinfo
))
3476 if (TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3480 "not vectorized: base addr of dr is a "
3483 if (is_a
<bb_vec_info
> (vinfo
))
3486 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3491 if (TREE_THIS_VOLATILE (DR_REF (dr
)))
3493 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3496 "not vectorized: volatile type ");
3497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3498 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3501 if (is_a
<bb_vec_info
> (vinfo
))
3507 if (stmt_can_throw_internal (stmt
))
3509 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3512 "not vectorized: statement can throw an "
3514 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3515 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3518 if (is_a
<bb_vec_info
> (vinfo
))
3521 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3526 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
3527 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
3529 if (dump_enabled_p ())
3531 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3532 "not vectorized: statement is bitfield "
3534 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3535 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3538 if (is_a
<bb_vec_info
> (vinfo
))
3541 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3546 base
= unshare_expr (DR_BASE_ADDRESS (dr
));
3547 offset
= unshare_expr (DR_OFFSET (dr
));
3548 init
= unshare_expr (DR_INIT (dr
));
3550 if (is_gimple_call (stmt
)
3551 && (!gimple_call_internal_p (stmt
)
3552 || (gimple_call_internal_fn (stmt
) != IFN_MASK_LOAD
3553 && gimple_call_internal_fn (stmt
) != IFN_MASK_STORE
)))
3555 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3558 "not vectorized: dr in a call ");
3559 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3560 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3563 if (is_a
<bb_vec_info
> (vinfo
))
3566 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3571 /* Update DR field in stmt_vec_info struct. */
3573 /* If the dataref is in an inner-loop of the loop that is considered for
3574 for vectorization, we also want to analyze the access relative to
3575 the outer-loop (DR contains information only relative to the
3576 inner-most enclosing loop). We do that by building a reference to the
3577 first location accessed by the inner-loop, and analyze it relative to
3579 if (loop
&& nested_in_vect_loop_p (loop
, stmt
))
3581 tree outer_step
, outer_base
, outer_init
;
3582 HOST_WIDE_INT pbitsize
, pbitpos
;
3585 int punsignedp
, preversep
, pvolatilep
;
3586 affine_iv base_iv
, offset_iv
;
3589 /* Build a reference to the first location accessed by the
3590 inner-loop: *(BASE+INIT). (The first location is actually
3591 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3592 tree inner_base
= build_fold_indirect_ref
3593 (fold_build_pointer_plus (base
, init
));
3595 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_NOTE
, vect_location
,
3598 "analyze in outer-loop: ");
3599 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, inner_base
);
3600 dump_printf (MSG_NOTE
, "\n");
3603 outer_base
= get_inner_reference (inner_base
, &pbitsize
, &pbitpos
,
3604 &poffset
, &pmode
, &punsignedp
,
3605 &preversep
, &pvolatilep
, false);
3606 gcc_assert (outer_base
!= NULL_TREE
);
3608 if (pbitpos
% BITS_PER_UNIT
!= 0)
3610 if (dump_enabled_p ())
3611 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3612 "failed: bit offset alignment.\n");
3618 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3620 "failed: reverse storage order.\n");
3624 outer_base
= build_fold_addr_expr (outer_base
);
3625 if (!simple_iv (loop
, loop_containing_stmt (stmt
), outer_base
,
3628 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3630 "failed: evolution of base is not affine.\n");
3637 poffset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
), offset
,
3645 offset_iv
.base
= ssize_int (0);
3646 offset_iv
.step
= ssize_int (0);
3648 else if (!simple_iv (loop
, loop_containing_stmt (stmt
), poffset
,
3651 if (dump_enabled_p ())
3652 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3653 "evolution of offset is not affine.\n");
3657 outer_init
= ssize_int (pbitpos
/ BITS_PER_UNIT
);
3658 split_constant_offset (base_iv
.base
, &base_iv
.base
, &dinit
);
3659 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3660 split_constant_offset (offset_iv
.base
, &offset_iv
.base
, &dinit
);
3661 outer_init
= size_binop (PLUS_EXPR
, outer_init
, dinit
);
3663 outer_step
= size_binop (PLUS_EXPR
,
3664 fold_convert (ssizetype
, base_iv
.step
),
3665 fold_convert (ssizetype
, offset_iv
.step
));
3667 STMT_VINFO_DR_STEP (stmt_info
) = outer_step
;
3668 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3669 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
) = base_iv
.base
;
3670 STMT_VINFO_DR_INIT (stmt_info
) = outer_init
;
3671 STMT_VINFO_DR_OFFSET (stmt_info
) =
3672 fold_convert (ssizetype
, offset_iv
.base
);
3673 STMT_VINFO_DR_ALIGNED_TO (stmt_info
) =
3674 size_int (highest_pow2_factor (offset_iv
.base
));
3676 if (dump_enabled_p ())
3678 dump_printf_loc (MSG_NOTE
, vect_location
,
3679 "\touter base_address: ");
3680 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3681 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3682 dump_printf (MSG_NOTE
, "\n\touter offset from base address: ");
3683 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3684 STMT_VINFO_DR_OFFSET (stmt_info
));
3685 dump_printf (MSG_NOTE
,
3686 "\n\touter constant offset from base address: ");
3687 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3688 STMT_VINFO_DR_INIT (stmt_info
));
3689 dump_printf (MSG_NOTE
, "\n\touter step: ");
3690 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3691 STMT_VINFO_DR_STEP (stmt_info
));
3692 dump_printf (MSG_NOTE
, "\n\touter aligned to: ");
3693 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3694 STMT_VINFO_DR_ALIGNED_TO (stmt_info
));
3695 dump_printf (MSG_NOTE
, "\n");
3699 if (STMT_VINFO_DATA_REF (stmt_info
))
3701 if (dump_enabled_p ())
3703 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3704 "not vectorized: more than one data ref "
3706 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3707 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3710 if (is_a
<bb_vec_info
> (vinfo
))
3713 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3718 STMT_VINFO_DATA_REF (stmt_info
) = dr
;
3719 if (simd_lane_access
)
3721 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
) = true;
3722 free_data_ref (datarefs
[i
]);
3726 /* Set vectype for STMT. */
3727 scalar_type
= TREE_TYPE (DR_REF (dr
));
3728 STMT_VINFO_VECTYPE (stmt_info
)
3729 = get_vectype_for_scalar_type (scalar_type
);
3730 if (!STMT_VINFO_VECTYPE (stmt_info
))
3732 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3735 "not vectorized: no vectype for stmt: ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3737 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
3738 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
3740 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3743 if (is_a
<bb_vec_info
> (vinfo
))
3745 /* No vector type is fine, the ref can still participate
3746 in dependence analysis, we just can't vectorize it. */
3747 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
3751 if (gatherscatter
!= SG_NONE
|| simd_lane_access
)
3753 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3754 if (gatherscatter
!= SG_NONE
)
3761 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_NOTE
, vect_location
,
3764 "got vectype for stmt: ");
3765 dump_gimple_stmt (MSG_NOTE
, TDF_SLIM
, stmt
, 0);
3766 dump_generic_expr (MSG_NOTE
, TDF_SLIM
,
3767 STMT_VINFO_VECTYPE (stmt_info
));
3768 dump_printf (MSG_NOTE
, "\n");
3772 /* Adjust the minimal vectorization factor according to the
3774 vf
= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
3778 if (gatherscatter
!= SG_NONE
)
3781 if (!vect_check_gather_scatter (stmt
, as_a
<loop_vec_info
> (vinfo
),
3783 || get_vectype_for_scalar_type (TREE_TYPE (off
)) == NULL_TREE
)
3785 STMT_VINFO_DATA_REF (stmt_info
) = NULL
;
3787 if (dump_enabled_p ())
3789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3790 (gatherscatter
== GATHER
) ?
3791 "not vectorized: not suitable for gather "
3793 "not vectorized: not suitable for scatter "
3795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3796 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3802 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
3805 else if (is_a
<loop_vec_info
> (vinfo
)
3806 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
3808 if (nested_in_vect_loop_p (loop
, stmt
))
3810 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3813 "not vectorized: not suitable for strided "
3815 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION
, TDF_SLIM
, stmt
, 0);
3816 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
3820 STMT_VINFO_STRIDED_P (stmt_info
) = true;
3824 /* If we stopped analysis at the first dataref we could not analyze
3825 when trying to vectorize a basic-block mark the rest of the datarefs
3826 as not vectorizable and truncate the vector of datarefs. That
3827 avoids spending useless time in analyzing their dependence. */
3828 if (i
!= datarefs
.length ())
3830 gcc_assert (is_a
<bb_vec_info
> (vinfo
));
3831 for (unsigned j
= i
; j
< datarefs
.length (); ++j
)
3833 data_reference_p dr
= datarefs
[j
];
3834 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr
))) = false;
3837 datarefs
.truncate (i
);
3844 /* Function vect_get_new_vect_var.
3846 Returns a name for a new variable. The current naming scheme appends the
3847 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3848 the name of vectorizer generated variables, and appends that to NAME if
3852 vect_get_new_vect_var (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Select the name prefix for the new variable according to VAR_KIND.
   NOTE(review): the per-case prefix assignments were elided in this
   extract -- confirm the exact prefix strings against the full source.  */
3859 case vect_simple_var
:
3862 case vect_scalar_var
:
3865 case vect_pointer_var
:
/* A caller-supplied NAME is glued to the prefix with '_' before the
   temporary register is created.  */
3874 char* tmp
= concat (prefix
, "_", name
, NULL
);
3875 new_vect_var
= create_tmp_reg (type
, tmp
);
/* No NAME given: fall back to the bare prefix as the temp's name.  */
3879 new_vect_var
= create_tmp_reg (type
, prefix
);
3881 return new_vect_var
;
3884 /* Like vect_get_new_vect_var but return an SSA name. */
3887 vect_get_new_ssa_name (tree type
, enum vect_var_kind var_kind
, const char *name
)
/* Select the name prefix according to VAR_KIND, mirroring
   vect_get_new_vect_var.  NOTE(review): the per-case prefix
   assignments were elided in this extract.  */
3894 case vect_simple_var
:
3897 case vect_scalar_var
:
3900 case vect_pointer_var
:
/* With a caller-supplied NAME, build "<prefix>_<name>" and make an
   anonymous SSA temporary with that identifier.  */
3909 char* tmp
= concat (prefix
, "_", name
, NULL
);
3910 new_vect_var
= make_temp_ssa_name (type
, NULL
, tmp
);
/* Otherwise name the SSA temporary after the bare prefix.  */
3914 new_vect_var
= make_temp_ssa_name (type
, NULL
, prefix
);
3916 return new_vect_var
;
3919 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3922 vect_duplicate_ssa_name_ptr_info (tree name
, data_reference
*dr
,
3923 stmt_vec_info stmt_info
)
/* Start from DR's recorded points-to information.  */
3925 duplicate_ssa_name_ptr_info (name
, DR_PTR_INFO (dr
));
/* Alignment is taken from the statement's vector type; misalignment
   from the data reference itself.  */
3926 unsigned int align
= TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info
));
3927 int misalign
= DR_MISALIGNMENT (dr
);
/* NOTE(review): the guard choosing between the two calls below (an
   "is misalignment unknown" test) was elided in this extract --
   presumably unknown misalignment clears the info, otherwise the
   known align/misalign pair is recorded.  Confirm against the full
   source.  */
3929 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name
));
3931 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name
), align
, misalign
);
3934 /* Function vect_create_addr_base_for_vector_ref.
3936 Create an expression that computes the address of the first memory location
3937 that will be accessed for a data reference.
3940 STMT: The statement containing the data reference.
3941 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3942 OFFSET: Optional. If supplied, it is added to the initial address.
3943 LOOP: Specify relative to which loop-nest should the address be computed.
3944 For example, when the dataref is in an inner-loop nested in an
3945 outer-loop that is now being vectorized, LOOP can be either the
3946 outer-loop, or the inner-loop. The first memory location accessed
3947 by the following dataref ('in' points to short):
3954 if LOOP=i_loop: &in (relative to i_loop)
3955 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3956 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3957 initial address. Unlike OFFSET, which is number of elements to
3958 be added, BYTE_OFFSET is measured in bytes.
3961 1. Return an SSA_NAME whose value is the address of the memory location of
3962 the first vector of the data reference.
3963 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3964 these statement(s) which define the returned SSA_NAME.
3966 FORNOW: We are only handling array accesses with step 1. */
3969 vect_create_addr_base_for_vector_ref (gimple
*stmt
,
3970 gimple_seq
*new_stmt_list
,
3975 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
3976 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3978 const char *base_name
;
3981 gimple_seq seq
= NULL
;
/* STEP is the byte size of one scalar element of the reference.  */
3985 tree step
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr
)));
3986 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
/* When the address is requested relative to a loop other than the one
   containing STMT (outer-loop vectorization), use the base/offset/init
   that were analyzed relative to the outer loop.  */
3988 if (loop_vinfo
&& loop
&& loop
!= (gimple_bb (stmt
))->loop_father
)
3990 struct loop
*outer_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3992 gcc_assert (nested_in_vect_loop_p (outer_loop
, stmt
));
3994 data_ref_base
= unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info
));
3995 base_offset
= unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info
));
3996 init
= unshare_expr (STMT_VINFO_DR_INIT (stmt_info
));
/* Otherwise take base/offset/init directly from the data reference.  */
4000 data_ref_base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4001 base_offset
= unshare_expr (DR_OFFSET (dr
));
4002 init
= unshare_expr (DR_INIT (dr
));
4006 base_name
= get_name (data_ref_base
);
/* NOTE(review): the branch condition selecting this alternative (no
   usable base address) was elided in this extract.  */
4009 base_offset
= ssize_int (0);
4010 init
= ssize_int (0);
4011 base_name
= get_name (DR_REF (dr
));
/* Fold the constant INIT into the variable part of the offset.  */
4014 /* Create base_offset */
4015 base_offset
= size_binop (PLUS_EXPR
,
4016 fold_convert (sizetype
, base_offset
),
4017 fold_convert (sizetype
, init
));
/* OFFSET is counted in elements; scale it by the element size (STEP)
   before adding it in.  */
4021 offset
= fold_build2 (MULT_EXPR
, sizetype
,
4022 fold_convert (sizetype
, offset
), step
);
4023 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4024 base_offset
, offset
);
/* BYTE_OFFSET is already in bytes; add it directly.  */
4028 byte_offset
= fold_convert (sizetype
, byte_offset
);
4029 base_offset
= fold_build2 (PLUS_EXPR
, sizetype
,
4030 base_offset
, byte_offset
);
/* Combine the invariant base pointer with the accumulated offset.  */
4033 /* base + base_offset */
4035 addr_base
= fold_build_pointer_plus (data_ref_base
, base_offset
);
/* NOTE(review): the alternative below (taking the address of the
   reference itself) is guarded by a condition elided in this extract.  */
4038 addr_base
= build1 (ADDR_EXPR
,
4039 build_pointer_type (TREE_TYPE (DR_REF (dr
))),
4040 unshare_expr (DR_REF (dr
)));
/* Gimplify the address expression into statements appended to SEQ,
   naming the result after the base for readable dumps.  */
4043 vect_ptr_type
= build_pointer_type (STMT_VINFO_VECTYPE (stmt_info
));
4044 dest
= vect_get_new_vect_var (vect_ptr_type
, vect_pointer_var
, base_name
);
4045 addr_base
= force_gimple_operand (addr_base
, &seq
, true, dest
);
4046 gimple_seq_add_seq (new_stmt_list
, seq
);
/* Propagate points-to/alignment info to the new SSA name; an extra
   OFFSET/BYTE_OFFSET invalidates the known alignment.  */
4048 if (DR_PTR_INFO (dr
)
4049 && TREE_CODE (addr_base
) == SSA_NAME
4050 && !SSA_NAME_PTR_INFO (addr_base
))
4052 vect_duplicate_ssa_name_ptr_info (addr_base
, dr
, stmt_info
);
4053 if (offset
|| byte_offset
)
4054 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base
));
/* Diagnostic dump of the address that was built.  */
4057 if (dump_enabled_p ())
4059 dump_printf_loc (MSG_NOTE
, vect_location
, "created ");
4060 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, addr_base
);
4061 dump_printf (MSG_NOTE
, "\n");
4068 /* Function vect_create_data_ref_ptr.
4070 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4071 location accessed in the loop by STMT, along with the def-use update
4072 chain to appropriately advance the pointer through the loop iterations.
4073 Also set aliasing information for the pointer. This pointer is used by
4074 the callers to this function to create a memory reference expression for
4075 vector load/store access.
4078 1. STMT: a stmt that references memory. Expected to be of the form
4079 GIMPLE_ASSIGN <name, data-ref> or
4080 GIMPLE_ASSIGN <data-ref, name>.
4081 2. AGGR_TYPE: the type of the reference, which should be either a vector
4083 3. AT_LOOP: the loop where the vector memref is to be created.
4084 4. OFFSET (optional): an offset to be added to the initial address accessed
4085 by the data-ref in STMT.
4086 5. BSI: location where the new stmts are to be placed if there is no loop
4087 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4088 pointing to the initial address.
4089 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4090 to the initial address accessed by the data-ref in STMT. This is
4091 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4095 1. Declare a new ptr to vector_type, and have it point to the base of the
4096 data reference (initial addressed accessed by the data reference).
4097 For example, for vector of type V8HI, the following code is generated:
4100 ap = (v8hi *)initial_address;
4102 if OFFSET is not supplied:
4103 initial_address = &a[init];
4104 if OFFSET is supplied:
4105 initial_address = &a[init + OFFSET];
4106 if BYTE_OFFSET is supplied:
4107 initial_address = &a[init] + BYTE_OFFSET;
4109 Return the initial_address in INITIAL_ADDRESS.
4111 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4112 update the pointer in each iteration of the loop.
4114 Return the increment stmt that updates the pointer in PTR_INCR.
4116 3. Set INV_P to true if the access pattern of the data reference in the
4117 vectorized loop is invariant. Set it to false otherwise.
4119 4. Return the pointer. */
4122 vect_create_data_ref_ptr (gimple
*stmt
, tree aggr_type
, struct loop
*at_loop
,
4123 tree offset
, tree
*initial_address
,
4124 gimple_stmt_iterator
*gsi
, gimple
**ptr_incr
,
4125 bool only_init
, bool *inv_p
, tree byte_offset
)
4127 const char *base_name
;
4128 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4129 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4130 struct loop
*loop
= NULL
;
4131 bool nested_in_vect_loop
= false;
4132 struct loop
*containing_loop
= NULL
;
4136 gimple_seq new_stmt_list
= NULL
;
4140 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4142 gimple_stmt_iterator incr_gsi
;
4144 tree indx_before_incr
, indx_after_incr
;
4147 bb_vec_info bb_vinfo
= STMT_VINFO_BB_VINFO (stmt_info
);
4149 gcc_assert (TREE_CODE (aggr_type
) == ARRAY_TYPE
4150 || TREE_CODE (aggr_type
) == VECTOR_TYPE
);
4154 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4155 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4156 containing_loop
= (gimple_bb (stmt
))->loop_father
;
4157 pe
= loop_preheader_edge (loop
);
4161 gcc_assert (bb_vinfo
);
4166 /* Check the step (evolution) of the load in LOOP, and record
4167 whether it's invariant. */
4168 if (nested_in_vect_loop
)
4169 step
= STMT_VINFO_DR_STEP (stmt_info
);
4171 step
= DR_STEP (STMT_VINFO_DATA_REF (stmt_info
));
4173 if (integer_zerop (step
))
4178 /* Create an expression for the first address accessed by this load
4180 base_name
= get_name (DR_BASE_ADDRESS (dr
));
4182 if (dump_enabled_p ())
4184 tree dr_base_type
= TREE_TYPE (DR_BASE_OBJECT (dr
));
4185 dump_printf_loc (MSG_NOTE
, vect_location
,
4186 "create %s-pointer variable to type: ",
4187 get_tree_code_name (TREE_CODE (aggr_type
)));
4188 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, aggr_type
);
4189 if (TREE_CODE (dr_base_type
) == ARRAY_TYPE
)
4190 dump_printf (MSG_NOTE
, " vectorizing an array ref: ");
4191 else if (TREE_CODE (dr_base_type
) == VECTOR_TYPE
)
4192 dump_printf (MSG_NOTE
, " vectorizing a vector ref: ");
4193 else if (TREE_CODE (dr_base_type
) == RECORD_TYPE
)
4194 dump_printf (MSG_NOTE
, " vectorizing a record based array ref: ");
4196 dump_printf (MSG_NOTE
, " vectorizing a pointer ref: ");
4197 dump_generic_expr (MSG_NOTE
, TDF_SLIM
, DR_BASE_OBJECT (dr
));
4198 dump_printf (MSG_NOTE
, "\n");
4201 /* (1) Create the new aggregate-pointer variable.
4202 Vector and array types inherit the alias set of their component
4203 type by default so we need to use a ref-all pointer if the data
4204 reference does not conflict with the created aggregated data
4205 reference because it is not addressable. */
4206 bool need_ref_all
= false;
4207 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4208 get_alias_set (DR_REF (dr
))))
4209 need_ref_all
= true;
4210 /* Likewise for any of the data references in the stmt group. */
4211 else if (STMT_VINFO_GROUP_SIZE (stmt_info
) > 1)
4213 gimple
*orig_stmt
= STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info
);
4216 stmt_vec_info sinfo
= vinfo_for_stmt (orig_stmt
);
4217 struct data_reference
*sdr
= STMT_VINFO_DATA_REF (sinfo
);
4218 if (!alias_sets_conflict_p (get_alias_set (aggr_type
),
4219 get_alias_set (DR_REF (sdr
))))
4221 need_ref_all
= true;
4224 orig_stmt
= STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo
);
4228 aggr_ptr_type
= build_pointer_type_for_mode (aggr_type
, ptr_mode
,
4230 aggr_ptr
= vect_get_new_vect_var (aggr_ptr_type
, vect_pointer_var
, base_name
);
4233 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4234 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4235 def-use update cycles for the pointer: one relative to the outer-loop
4236 (LOOP), which is what steps (3) and (4) below do. The other is relative
4237 to the inner-loop (which is the inner-most loop containing the dataref),
4238 and this is done be step (5) below.
4240 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4241 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4242 redundant. Steps (3),(4) create the following:
4245 LOOP: vp1 = phi(vp0,vp2)
4251 If there is an inner-loop nested in loop, then step (5) will also be
4252 applied, and an additional update in the inner-loop will be created:
4255 LOOP: vp1 = phi(vp0,vp2)
4257 inner: vp3 = phi(vp1,vp4)
4258 vp4 = vp3 + inner_step
4264 /* (2) Calculate the initial address of the aggregate-pointer, and set
4265 the aggregate-pointer to point to it before the loop. */
4267 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4269 new_temp
= vect_create_addr_base_for_vector_ref (stmt
, &new_stmt_list
,
4270 offset
, loop
, byte_offset
);
4275 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, new_stmt_list
);
4276 gcc_assert (!new_bb
);
4279 gsi_insert_seq_before (gsi
, new_stmt_list
, GSI_SAME_STMT
);
4282 *initial_address
= new_temp
;
4283 aggr_ptr_init
= new_temp
;
4285 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4286 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4287 inner-loop nested in LOOP (during outer-loop vectorization). */
4289 /* No update in loop is required. */
4290 if (only_init
&& (!loop_vinfo
|| at_loop
== loop
))
4291 aptr
= aggr_ptr_init
;
4294 /* The step of the aggregate pointer is the type size. */
4295 tree iv_step
= TYPE_SIZE_UNIT (aggr_type
);
4296 /* One exception to the above is when the scalar step of the load in
4297 LOOP is zero. In this case the step here is also zero. */
4299 iv_step
= size_zero_node
;
4300 else if (tree_int_cst_sgn (step
) == -1)
4301 iv_step
= fold_build1 (NEGATE_EXPR
, TREE_TYPE (iv_step
), iv_step
);
4303 standard_iv_increment_position (loop
, &incr_gsi
, &insert_after
);
4305 create_iv (aggr_ptr_init
,
4306 fold_convert (aggr_ptr_type
, iv_step
),
4307 aggr_ptr
, loop
, &incr_gsi
, insert_after
,
4308 &indx_before_incr
, &indx_after_incr
);
4309 incr
= gsi_stmt (incr_gsi
);
4310 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4312 /* Copy the points-to information if it exists. */
4313 if (DR_PTR_INFO (dr
))
4315 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4316 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4321 aptr
= indx_before_incr
;
4324 if (!nested_in_vect_loop
|| only_init
)
4328 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4329 nested in LOOP, if exists. */
4331 gcc_assert (nested_in_vect_loop
);
4334 standard_iv_increment_position (containing_loop
, &incr_gsi
,
4336 create_iv (aptr
, fold_convert (aggr_ptr_type
, DR_STEP (dr
)), aggr_ptr
,
4337 containing_loop
, &incr_gsi
, insert_after
, &indx_before_incr
,
4339 incr
= gsi_stmt (incr_gsi
);
4340 set_vinfo_for_stmt (incr
, new_stmt_vec_info (incr
, loop_vinfo
));
4342 /* Copy the points-to information if it exists. */
4343 if (DR_PTR_INFO (dr
))
4345 vect_duplicate_ssa_name_ptr_info (indx_before_incr
, dr
, stmt_info
);
4346 vect_duplicate_ssa_name_ptr_info (indx_after_incr
, dr
, stmt_info
);
4351 return indx_before_incr
;
4358 /* Function bump_vector_ptr
4360 Increment a pointer (to a vector type) by vector-size. If requested,
4361 i.e. if PTR-INCR is given, then also connect the new increment stmt
4362 to the existing def-use update-chain of the pointer, by modifying
4363 the PTR_INCR as illustrated below:
4365 The pointer def-use update-chain before this function:
4366 DATAREF_PTR = phi (p_0, p_2)
4368 PTR_INCR: p_2 = DATAREF_PTR + step
4370 The pointer def-use update-chain after this function:
4371 DATAREF_PTR = phi (p_0, p_2)
4373 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4375 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4378 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4380 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4381 the loop. The increment amount across iterations is expected
4383 BSI - location where the new update stmt is to be placed.
4384 STMT - the original scalar memory-access stmt that is being vectorized.
4385 BUMP - optional. The offset by which to bump the pointer. If not given,
4386 the offset is assumed to be vector_size.
4388 Output: Return NEW_DATAREF_PTR as illustrated above.
4393 bump_vector_ptr (tree dataref_ptr
, gimple
*ptr_incr
, gimple_stmt_iterator
*gsi
,
4394 gimple
*stmt
, tree bump
)
4396 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4397 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4398 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
/* Default bump amount: the byte size of one vector.  NOTE(review):
   the line overriding UPDATE with a caller-supplied BUMP was elided
   in this extract.  */
4399 tree update
= TYPE_SIZE_UNIT (vectype
);
4402 use_operand_p use_p
;
4403 tree new_dataref_ptr
;
/* Create the bumped pointer SSA name, reusing the old identifier when
   possible so dumps stay readable.  */
4408 if (TREE_CODE (dataref_ptr
) == SSA_NAME
)
4409 new_dataref_ptr
= copy_ssa_name (dataref_ptr
);
4411 new_dataref_ptr
= make_ssa_name (TREE_TYPE (dataref_ptr
));
/* Emit NEW_DATAREF_PTR = DATAREF_PTR + UPDATE at GSI.  */
4412 incr_stmt
= gimple_build_assign (new_dataref_ptr
, POINTER_PLUS_EXPR
,
4413 dataref_ptr
, update
);
4414 vect_finish_stmt_generation (stmt
, incr_stmt
, gsi
);
/* The bumped pointer keeps the points-to set but its alignment is no
   longer known.  */
4416 /* Copy the points-to information if it exists. */
4417 if (DR_PTR_INFO (dr
)
4419 duplicate_ssa_name_ptr_info (new_dataref_ptr
, DR_PTR_INFO (dr
));
4420 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr
));
/* NOTE(review): the early-return guard for a missing PTR_INCR was
   elided in this extract -- presumably this return fires when there is
   no cross-iteration increment to rewire.  */
4424 return new_dataref_ptr
;
/* Redirect PTR_INCR's use of the old pointer to the bumped one, as
   shown in the header comment.  */
4426 /* Update the vector-pointer's cross-iteration increment. */
4427 FOR_EACH_SSA_USE_OPERAND (use_p
, ptr_incr
, iter
, SSA_OP_USE
)
4429 tree use
= USE_FROM_PTR (use_p
);
4431 if (use
== dataref_ptr
)
4432 SET_USE (use_p
, new_dataref_ptr
);
/* Any other operand of PTR_INCR must be the step itself.  */
4434 gcc_assert (tree_int_cst_compare (use
, update
) == 0);
4437 return new_dataref_ptr
;
4441 /* Function vect_create_destination_var.
4443 Create a new temporary of type VECTYPE. */
4446 vect_create_destination_var (tree scalar_dest
, tree vectype
)
4452 enum vect_var_kind kind
;
4454 kind
= vectype
? vect_simple_var
: vect_scalar_var
;
4455 type
= vectype
? vectype
: TREE_TYPE (scalar_dest
);
4457 gcc_assert (TREE_CODE (scalar_dest
) == SSA_NAME
);
4459 name
= get_name (scalar_dest
);
4461 new_name
= xasprintf ("%s_%u", name
, SSA_NAME_VERSION (scalar_dest
));
4463 new_name
= xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest
));
4464 vec_dest
= vect_get_new_vect_var (type
, kind
, new_name
);
4470 /* Function vect_grouped_store_supported.
4472 Returns TRUE if interleave high and interleave low permutations
4473 are supported, and FALSE otherwise. */
4476 vect_grouped_store_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4478 machine_mode mode
= TYPE_MODE (vectype
);
4480 /* vect_permute_store_chain requires the group size to be equal to 3 or
4481 be a power of two. */
4482 if (count
!= 3 && exact_log2 (count
) == -1)
4484 if (dump_enabled_p ())
4485 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4486 "the size of the group of accesses"
4487 " is not a power of 2 or not eqaul to 3\n");
4491 /* Check that the permutation is supported. */
4492 if (VECTOR_MODE_P (mode
))
4494 unsigned int i
, nelt
= GET_MODE_NUNITS (mode
);
4495 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4499 unsigned int j0
= 0, j1
= 0, j2
= 0;
4502 for (j
= 0; j
< 3; j
++)
4504 int nelt0
= ((3 - j
) * nelt
) % 3;
4505 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4506 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4507 for (i
= 0; i
< nelt
; i
++)
4509 if (3 * i
+ nelt0
< nelt
)
4510 sel
[3 * i
+ nelt0
] = j0
++;
4511 if (3 * i
+ nelt1
< nelt
)
4512 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4513 if (3 * i
+ nelt2
< nelt
)
4514 sel
[3 * i
+ nelt2
] = 0;
4516 if (!can_vec_perm_p (mode
, false, sel
))
4518 if (dump_enabled_p ())
4519 dump_printf (MSG_MISSED_OPTIMIZATION
,
4520 "permutaion op not supported by target.\n");
4524 for (i
= 0; i
< nelt
; i
++)
4526 if (3 * i
+ nelt0
< nelt
)
4527 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4528 if (3 * i
+ nelt1
< nelt
)
4529 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4530 if (3 * i
+ nelt2
< nelt
)
4531 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4533 if (!can_vec_perm_p (mode
, false, sel
))
4535 if (dump_enabled_p ())
4536 dump_printf (MSG_MISSED_OPTIMIZATION
,
4537 "permutaion op not supported by target.\n");
4545 /* If length is not equal to 3 then only power of 2 is supported. */
4546 gcc_assert (exact_log2 (count
) != -1);
4548 for (i
= 0; i
< nelt
/ 2; i
++)
4551 sel
[i
* 2 + 1] = i
+ nelt
;
4553 if (can_vec_perm_p (mode
, false, sel
))
4555 for (i
= 0; i
< nelt
; i
++)
4557 if (can_vec_perm_p (mode
, false, sel
))
4563 if (dump_enabled_p ())
4564 dump_printf (MSG_MISSED_OPTIMIZATION
,
4565 "permutaion op not supported by target.\n");
4570 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4574 vect_store_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
4576 return vect_lanes_optab_supported_p ("vec_store_lanes",
4577 vec_store_lanes_optab
,
4582 /* Function vect_permute_store_chain.
4584 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4585 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4586 the data correctly for the stores. Return the final references for stores
4589 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4590 The input is 4 vectors each containing 8 elements. We assign a number to
4591 each element, the input sequence is:
4593 1st vec: 0 1 2 3 4 5 6 7
4594 2nd vec: 8 9 10 11 12 13 14 15
4595 3rd vec: 16 17 18 19 20 21 22 23
4596 4th vec: 24 25 26 27 28 29 30 31
4598 The output sequence should be:
4600 1st vec: 0 8 16 24 1 9 17 25
4601 2nd vec: 2 10 18 26 3 11 19 27
4602 3rd vec: 4 12 20 28 5 13 21 30
4603 4th vec: 6 14 22 30 7 15 23 31
4605 i.e., we interleave the contents of the four vectors in their order.
4607 We use interleave_high/low instructions to create such output. The input of
4608 each interleave_high/low operation is two vectors:
4611 the even elements of the result vector are obtained left-to-right from the
4612 high/low elements of the first vector. The odd elements of the result are
4613 obtained left-to-right from the high/low elements of the second vector.
4614 The output of interleave_high will be: 0 4 1 5
4615 and of interleave_low: 2 6 3 7
4618 The permutation is done in log LENGTH stages. In each stage interleave_high
4619 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4620 where the first argument is taken from the first half of DR_CHAIN and the
4621 second argument from it's second half.
4624 I1: interleave_high (1st vec, 3rd vec)
4625 I2: interleave_low (1st vec, 3rd vec)
4626 I3: interleave_high (2nd vec, 4th vec)
4627 I4: interleave_low (2nd vec, 4th vec)
4629 The output for the first stage is:
4631 I1: 0 16 1 17 2 18 3 19
4632 I2: 4 20 5 21 6 22 7 23
4633 I3: 8 24 9 25 10 26 11 27
4634 I4: 12 28 13 29 14 30 15 31
4636 The output of the second stage, i.e. the final result is:
4638 I1: 0 8 16 24 1 9 17 25
4639 I2: 2 10 18 26 3 11 19 27
4640 I3: 4 12 20 28 5 13 21 30
4641 I4: 6 14 22 30 7 15 23 31. */
4644 vect_permute_store_chain (vec
<tree
> dr_chain
,
4645 unsigned int length
,
4647 gimple_stmt_iterator
*gsi
,
4648 vec
<tree
> *result_chain
)
4650 tree vect1
, vect2
, high
, low
;
4652 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
4653 tree perm_mask_low
, perm_mask_high
;
4655 tree perm3_mask_low
, perm3_mask_high
;
4656 unsigned int i
, n
, log_length
= exact_log2 (length
);
4657 unsigned int j
, nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
4658 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
4660 result_chain
->quick_grow (length
);
4661 memcpy (result_chain
->address (), dr_chain
.address (),
4662 length
* sizeof (tree
));
4666 unsigned int j0
= 0, j1
= 0, j2
= 0;
4668 for (j
= 0; j
< 3; j
++)
4670 int nelt0
= ((3 - j
) * nelt
) % 3;
4671 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
4672 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
4674 for (i
= 0; i
< nelt
; i
++)
4676 if (3 * i
+ nelt0
< nelt
)
4677 sel
[3 * i
+ nelt0
] = j0
++;
4678 if (3 * i
+ nelt1
< nelt
)
4679 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
4680 if (3 * i
+ nelt2
< nelt
)
4681 sel
[3 * i
+ nelt2
] = 0;
4683 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4685 for (i
= 0; i
< nelt
; i
++)
4687 if (3 * i
+ nelt0
< nelt
)
4688 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
4689 if (3 * i
+ nelt1
< nelt
)
4690 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
4691 if (3 * i
+ nelt2
< nelt
)
4692 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
4694 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4696 vect1
= dr_chain
[0];
4697 vect2
= dr_chain
[1];
4699 /* Create interleaving stmt:
4700 low = VEC_PERM_EXPR <vect1, vect2,
4701 {j, nelt, *, j + 1, nelt + j + 1, *,
4702 j + 2, nelt + j + 2, *, ...}> */
4703 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
4704 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4705 vect2
, perm3_mask_low
);
4706 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4709 vect2
= dr_chain
[2];
4710 /* Create interleaving stmt:
4711 low = VEC_PERM_EXPR <vect1, vect2,
4712 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4713 6, 7, nelt + j + 2, ...}> */
4714 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
4715 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
4716 vect2
, perm3_mask_high
);
4717 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4718 (*result_chain
)[j
] = data_ref
;
4723 /* If length is not equal to 3 then only power of 2 is supported. */
4724 gcc_assert (exact_log2 (length
) != -1);
4726 for (i
= 0, n
= nelt
/ 2; i
< n
; i
++)
4729 sel
[i
* 2 + 1] = i
+ nelt
;
4731 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
4733 for (i
= 0; i
< nelt
; i
++)
4735 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
4737 for (i
= 0, n
= log_length
; i
< n
; i
++)
4739 for (j
= 0; j
< length
/2; j
++)
4741 vect1
= dr_chain
[j
];
4742 vect2
= dr_chain
[j
+length
/2];
4744 /* Create interleaving stmt:
4745 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4747 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
4748 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
4749 vect2
, perm_mask_high
);
4750 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4751 (*result_chain
)[2*j
] = high
;
4753 /* Create interleaving stmt:
4754 low = VEC_PERM_EXPR <vect1, vect2,
4755 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4757 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
4758 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
4759 vect2
, perm_mask_low
);
4760 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
4761 (*result_chain
)[2*j
+1] = low
;
4763 memcpy (dr_chain
.address (), result_chain
->address (),
4764 length
* sizeof (tree
));
4769 /* Function vect_setup_realignment
4771 This function is called when vectorizing an unaligned load using
4772 the dr_explicit_realign[_optimized] scheme.
4773 This function generates the following code at the loop prolog:
4776 x msq_init = *(floor(p)); # prolog load
4777 realignment_token = call target_builtin;
4779 x msq = phi (msq_init, ---)
4781 The stmts marked with x are generated only for the case of
4782 dr_explicit_realign_optimized.
4784 The code above sets up a new (vector) pointer, pointing to the first
4785 location accessed by STMT, and a "floor-aligned" load using that pointer.
4786 It also generates code to compute the "realignment-token" (if the relevant
4787 target hook was defined), and creates a phi-node at the loop-header bb
4788 whose arguments are the result of the prolog-load (created by this
4789 function) and the result of a load that takes place in the loop (to be
4790 created by the caller to this function).
4792 For the case of dr_explicit_realign_optimized:
4793 The caller to this function uses the phi-result (msq) to create the
4794 realignment code inside the loop, and sets up the missing phi argument,
4797 msq = phi (msq_init, lsq)
4798 lsq = *(floor(p')); # load in loop
4799 result = realign_load (msq, lsq, realignment_token);
4801 For the case of dr_explicit_realign:
4803 msq = *(floor(p)); # load in loop
4805 lsq = *(floor(p')); # load in loop
4806 result = realign_load (msq, lsq, realignment_token);
4809 STMT - (scalar) load stmt to be vectorized. This load accesses
4810 a memory location that may be unaligned.
4811 BSI - place where new code is to be inserted.
4812 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4816 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4817 target hook, if defined.
4818 Return value - the result of the loop-header phi node. */
4821 vect_setup_realignment (gimple
*stmt
, gimple_stmt_iterator
*gsi
,
4822 tree
*realignment_token
,
4823 enum dr_alignment_support alignment_support_scheme
,
4825 struct loop
**at_loop
)
4827 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
4828 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
4829 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
4830 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
4831 struct loop
*loop
= NULL
;
4833 tree scalar_dest
= gimple_assign_lhs (stmt
);
4839 tree msq_init
= NULL_TREE
;
4842 tree msq
= NULL_TREE
;
4843 gimple_seq stmts
= NULL
;
4845 bool compute_in_loop
= false;
4846 bool nested_in_vect_loop
= false;
4847 struct loop
*containing_loop
= (gimple_bb (stmt
))->loop_father
;
4848 struct loop
*loop_for_initial_load
= NULL
;
4852 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4853 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt
);
4856 gcc_assert (alignment_support_scheme
== dr_explicit_realign
4857 || alignment_support_scheme
== dr_explicit_realign_optimized
);
4859 /* We need to generate three things:
4860 1. the misalignment computation
4861 2. the extra vector load (for the optimized realignment scheme).
4862 3. the phi node for the two vectors from which the realignment is
4863 done (for the optimized realignment scheme). */
4865 /* 1. Determine where to generate the misalignment computation.
4867 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4868 calculation will be generated by this function, outside the loop (in the
4869 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4870 caller, inside the loop.
4872 Background: If the misalignment remains fixed throughout the iterations of
4873 the loop, then both realignment schemes are applicable, and also the
4874 misalignment computation can be done outside LOOP. This is because we are
4875 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4876 are a multiple of VS (the Vector Size), and therefore the misalignment in
4877 different vectorized LOOP iterations is always the same.
4878 The problem arises only if the memory access is in an inner-loop nested
4879 inside LOOP, which is now being vectorized using outer-loop vectorization.
4880 This is the only case when the misalignment of the memory access may not
4881 remain fixed throughout the iterations of the inner-loop (as explained in
4882 detail in vect_supportable_dr_alignment). In this case, not only is the
4883 optimized realignment scheme not applicable, but also the misalignment
4884 computation (and generation of the realignment token that is passed to
4885 REALIGN_LOAD) have to be done inside the loop.
4887 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4888 or not, which in turn determines if the misalignment is computed inside
4889 the inner-loop, or outside LOOP. */
4891 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
4893 compute_in_loop
= true;
4894 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
4898 /* 2. Determine where to generate the extra vector load.
4900 For the optimized realignment scheme, instead of generating two vector
4901 loads in each iteration, we generate a single extra vector load in the
4902 preheader of the loop, and in each iteration reuse the result of the
4903 vector load from the previous iteration. In case the memory access is in
4904 an inner-loop nested inside LOOP, which is now being vectorized using
4905 outer-loop vectorization, we need to determine whether this initial vector
4906 load should be generated at the preheader of the inner-loop, or can be
4907 generated at the preheader of LOOP. If the memory access has no evolution
4908 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4909 to be generated inside LOOP (in the preheader of the inner-loop). */
4911 if (nested_in_vect_loop
)
4913 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
4914 bool invariant_in_outerloop
=
4915 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
4916 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
4919 loop_for_initial_load
= loop
;
4921 *at_loop
= loop_for_initial_load
;
4923 if (loop_for_initial_load
)
4924 pe
= loop_preheader_edge (loop_for_initial_load
);
4926 /* 3. For the case of the optimized realignment, create the first vector
4927 load at the loop preheader. */
4929 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
4931 /* Create msq_init = *(floor(p1)) in the loop preheader */
4934 gcc_assert (!compute_in_loop
);
4935 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
4936 ptr
= vect_create_data_ref_ptr (stmt
, vectype
, loop_for_initial_load
,
4937 NULL_TREE
, &init_addr
, NULL
, &inc
,
4939 if (TREE_CODE (ptr
) == SSA_NAME
)
4940 new_temp
= copy_ssa_name (ptr
);
4942 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
4943 new_stmt
= gimple_build_assign
4944 (new_temp
, BIT_AND_EXPR
, ptr
,
4945 build_int_cst (TREE_TYPE (ptr
),
4946 -(HOST_WIDE_INT
)TYPE_ALIGN_UNIT (vectype
)));
4947 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4948 gcc_assert (!new_bb
);
4950 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
4951 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
4952 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
4953 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4954 gimple_assign_set_lhs (new_stmt
, new_temp
);
4957 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
4958 gcc_assert (!new_bb
);
4961 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
4963 msq_init
= gimple_assign_lhs (new_stmt
);
4966 /* 4. Create realignment token using a target builtin, if available.
4967 It is done either inside the containing loop, or before LOOP (as
4968 determined above). */
4970 if (targetm
.vectorize
.builtin_mask_for_load
)
4975 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4978 /* Generate the INIT_ADDR computation outside LOOP. */
4979 init_addr
= vect_create_addr_base_for_vector_ref (stmt
, &stmts
,
4983 pe
= loop_preheader_edge (loop
);
4984 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
4985 gcc_assert (!new_bb
);
4988 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
4991 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
4992 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
4994 vect_create_destination_var (scalar_dest
,
4995 gimple_call_return_type (new_stmt
));
4996 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
4997 gimple_call_set_lhs (new_stmt
, new_temp
);
4999 if (compute_in_loop
)
5000 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5003 /* Generate the misalignment computation outside LOOP. */
5004 pe
= loop_preheader_edge (loop
);
5005 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5006 gcc_assert (!new_bb
);
5009 *realignment_token
= gimple_call_lhs (new_stmt
);
5011 /* The result of the CALL_EXPR to this builtin is determined from
5012 the value of the parameter and no global variables are touched
5013 which makes the builtin a "const" function. Requiring the
5014 builtin to have the "const" attribute makes it unnecessary
5015 to call mark_call_clobbered. */
5016 gcc_assert (TREE_READONLY (builtin_decl
));
5019 if (alignment_support_scheme
== dr_explicit_realign
)
5022 gcc_assert (!compute_in_loop
);
5023 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5026 /* 5. Create msq = phi <msq_init, lsq> in loop */
5028 pe
= loop_preheader_edge (containing_loop
);
5029 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5030 msq
= make_ssa_name (vec_dest
);
5031 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5032 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5038 /* Function vect_grouped_load_supported.
5040 Returns TRUE if even and odd permutations are supported,
5041 and FALSE otherwise. */
5044 vect_grouped_load_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5046 machine_mode mode
= TYPE_MODE (vectype
);
5048 /* vect_permute_load_chain requires the group size to be equal to 3 or
5049 be a power of two. */
5050 if (count
!= 3 && exact_log2 (count
) == -1)
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5054 "the size of the group of accesses"
5055 " is not a power of 2 or not equal to 3\n");
5059 /* Check that the permutation is supported. */
5060 if (VECTOR_MODE_P (mode
))
5062 unsigned int i
, j
, nelt
= GET_MODE_NUNITS (mode
);
5063 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5068 for (k
= 0; k
< 3; k
++)
5070 for (i
= 0; i
< nelt
; i
++)
5071 if (3 * i
+ k
< 2 * nelt
)
5075 if (!can_vec_perm_p (mode
, false, sel
))
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5079 "shuffle of 3 loads is not supported by"
5083 for (i
= 0, j
= 0; i
< nelt
; i
++)
5084 if (3 * i
+ k
< 2 * nelt
)
5087 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5088 if (!can_vec_perm_p (mode
, false, sel
))
5090 if (dump_enabled_p ())
5091 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5092 "shuffle of 3 loads is not supported by"
5101 /* If length is not equal to 3 then only power of 2 is supported. */
5102 gcc_assert (exact_log2 (count
) != -1);
5103 for (i
= 0; i
< nelt
; i
++)
5105 if (can_vec_perm_p (mode
, false, sel
))
5107 for (i
= 0; i
< nelt
; i
++)
5109 if (can_vec_perm_p (mode
, false, sel
))
5115 if (dump_enabled_p ())
5116 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5117 "extract even/odd not supported by target\n");
5121 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5125 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
)
5127 return vect_lanes_optab_supported_p ("vec_load_lanes",
5128 vec_load_lanes_optab
,
5132 /* Function vect_permute_load_chain.
5134 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5135 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5136 the input data correctly. Return the final references for loads in
5139 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5140 The input is 4 vectors each containing 8 elements. We assign a number to each
5141 element, the input sequence is:
5143 1st vec: 0 1 2 3 4 5 6 7
5144 2nd vec: 8 9 10 11 12 13 14 15
5145 3rd vec: 16 17 18 19 20 21 22 23
5146 4th vec: 24 25 26 27 28 29 30 31
5148 The output sequence should be:
5150 1st vec: 0 4 8 12 16 20 24 28
5151 2nd vec: 1 5 9 13 17 21 25 29
5152 3rd vec: 2 6 10 14 18 22 26 30
5153 4th vec: 3 7 11 15 19 23 27 31
5155 i.e., the first output vector should contain the first elements of each
5156 interleaving group, etc.
5158 We use extract_even/odd instructions to create such output. The input of
5159 each extract_even/odd operation is two vectors
5163 and the output is the vector of extracted even/odd elements. The output of
5164 extract_even will be: 0 2 4 6
5165 and of extract_odd: 1 3 5 7
5168 The permutation is done in log LENGTH stages. In each stage extract_even
5169 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5170 their order. In our example,
5172 E1: extract_even (1st vec, 2nd vec)
5173 E2: extract_odd (1st vec, 2nd vec)
5174 E3: extract_even (3rd vec, 4th vec)
5175 E4: extract_odd (3rd vec, 4th vec)
5177 The output for the first stage will be:
5179 E1: 0 2 4 6 8 10 12 14
5180 E2: 1 3 5 7 9 11 13 15
5181 E3: 16 18 20 22 24 26 28 30
5182 E4: 17 19 21 23 25 27 29 31
5184 In order to proceed and create the correct sequence for the next stage (or
5185 for the correct output, if the second stage is the last one, as in our
5186 example), we first put the output of extract_even operation and then the
5187 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5188 The input for the second stage is:
5190 1st vec (E1): 0 2 4 6 8 10 12 14
5191 2nd vec (E3): 16 18 20 22 24 26 28 30
5192 3rd vec (E2): 1 3 5 7 9 11 13 15
5193 4th vec (E4): 17 19 21 23 25 27 29 31
5195 The output of the second stage:
5197 E1: 0 4 8 12 16 20 24 28
5198 E2: 2 6 10 14 18 22 26 30
5199 E3: 1 5 9 13 17 21 25 29
5200 E4: 3 7 11 15 19 23 27 31
5202 And RESULT_CHAIN after reordering:
5204 1st vec (E1): 0 4 8 12 16 20 24 28
5205 2nd vec (E3): 1 5 9 13 17 21 25 29
5206 3rd vec (E2): 2 6 10 14 18 22 26 30
5207 4th vec (E4): 3 7 11 15 19 23 27 31. */
5210 vect_permute_load_chain (vec
<tree
> dr_chain
,
5211 unsigned int length
,
5213 gimple_stmt_iterator
*gsi
,
5214 vec
<tree
> *result_chain
)
5216 tree data_ref
, first_vect
, second_vect
;
5217 tree perm_mask_even
, perm_mask_odd
;
5218 tree perm3_mask_low
, perm3_mask_high
;
5220 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5221 unsigned int i
, j
, log_length
= exact_log2 (length
);
5222 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5223 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5225 result_chain
->quick_grow (length
);
5226 memcpy (result_chain
->address (), dr_chain
.address (),
5227 length
* sizeof (tree
));
5233 for (k
= 0; k
< 3; k
++)
5235 for (i
= 0; i
< nelt
; i
++)
5236 if (3 * i
+ k
< 2 * nelt
)
5240 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, sel
);
5242 for (i
= 0, j
= 0; i
< nelt
; i
++)
5243 if (3 * i
+ k
< 2 * nelt
)
5246 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5248 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, sel
);
5250 first_vect
= dr_chain
[0];
5251 second_vect
= dr_chain
[1];
5253 /* Create interleaving stmt (low part of):
5254 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5256 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5257 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5258 second_vect
, perm3_mask_low
);
5259 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5261 /* Create interleaving stmt (high part of):
5262 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5264 first_vect
= data_ref
;
5265 second_vect
= dr_chain
[2];
5266 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5267 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
5268 second_vect
, perm3_mask_high
);
5269 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5270 (*result_chain
)[k
] = data_ref
;
5275 /* If length is not equal to 3 then only power of 2 is supported. */
5276 gcc_assert (exact_log2 (length
) != -1);
5278 for (i
= 0; i
< nelt
; ++i
)
5280 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, sel
);
5282 for (i
= 0; i
< nelt
; ++i
)
5284 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, sel
);
5286 for (i
= 0; i
< log_length
; i
++)
5288 for (j
= 0; j
< length
; j
+= 2)
5290 first_vect
= dr_chain
[j
];
5291 second_vect
= dr_chain
[j
+1];
5293 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5294 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
5295 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5296 first_vect
, second_vect
,
5298 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5299 (*result_chain
)[j
/2] = data_ref
;
5301 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5302 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
5303 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5304 first_vect
, second_vect
,
5306 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5307 (*result_chain
)[j
/2+length
/2] = data_ref
;
5309 memcpy (dr_chain
.address (), result_chain
->address (),
5310 length
* sizeof (tree
));
5315 /* Function vect_shift_permute_load_chain.
5317 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5318 sequence of stmts to reorder the input data accordingly.
5319 Return the final references for loads in RESULT_CHAIN.
5320 Return true if successed, false otherwise.
5322 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5323 The input is 3 vectors each containing 8 elements. We assign a
5324 number to each element, the input sequence is:
5326 1st vec: 0 1 2 3 4 5 6 7
5327 2nd vec: 8 9 10 11 12 13 14 15
5328 3rd vec: 16 17 18 19 20 21 22 23
5330 The output sequence should be:
5332 1st vec: 0 3 6 9 12 15 18 21
5333 2nd vec: 1 4 7 10 13 16 19 22
5334 3rd vec: 2 5 8 11 14 17 20 23
5336 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5338 First we shuffle all 3 vectors to get correct elements order:
5340 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5341 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5342 3rd vec: (16 19 22) (17 20 23) (18 21)
5344 Next we unite and shift vector 3 times:
5347 shift right by 6 the concatenation of:
5348 "1st vec" and "2nd vec"
5349 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5350 "2nd vec" and "3rd vec"
5351 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5352 "3rd vec" and "1st vec"
5353 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5356 So that now new vectors are:
5358 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5359 2nd vec: (10 13) (16 19 22) (17 20 23)
5360 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5363 shift right by 5 the concatenation of:
5364 "1st vec" and "3rd vec"
5365 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5366 "2nd vec" and "1st vec"
5367 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5368 "3rd vec" and "2nd vec"
5369 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5372 So that now new vectors are:
5374 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5375 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5376 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5379 shift right by 5 the concatenation of:
5380 "1st vec" and "1st vec"
5381 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5382 shift right by 3 the concatenation of:
5383 "2nd vec" and "2nd vec"
5384 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5387 So that now all vectors are READY:
5388 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5389 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5390 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5392 This algorithm is faster than one in vect_permute_load_chain if:
5393 1. "shift of a concatination" is faster than general permutation.
5395 2. The TARGET machine can't execute vector instructions in parallel.
5396 This is because each step of the algorithm depends on previous.
5397 The algorithm in vect_permute_load_chain is much more parallel.
5399 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5403 vect_shift_permute_load_chain (vec
<tree
> dr_chain
,
5404 unsigned int length
,
5406 gimple_stmt_iterator
*gsi
,
5407 vec
<tree
> *result_chain
)
5409 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
5410 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
5411 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
5414 tree vectype
= STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
));
5416 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5417 unsigned char *sel
= XALLOCAVEC (unsigned char, nelt
);
5418 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5419 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5421 result_chain
->quick_grow (length
);
5422 memcpy (result_chain
->address (), dr_chain
.address (),
5423 length
* sizeof (tree
));
5425 if (exact_log2 (length
) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 4)
5427 unsigned int j
, log_length
= exact_log2 (length
);
5428 for (i
= 0; i
< nelt
/ 2; ++i
)
5430 for (i
= 0; i
< nelt
/ 2; ++i
)
5431 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
5432 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5434 if (dump_enabled_p ())
5435 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5436 "shuffle of 2 fields structure is not \
5437 supported by target\n");
5440 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, sel
);
5442 for (i
= 0; i
< nelt
/ 2; ++i
)
5444 for (i
= 0; i
< nelt
/ 2; ++i
)
5445 sel
[nelt
/ 2 + i
] = i
* 2;
5446 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5448 if (dump_enabled_p ())
5449 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5450 "shuffle of 2 fields structure is not \
5451 supported by target\n");
5454 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, sel
);
5456 /* Generating permutation constant to shift all elements.
5457 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5458 for (i
= 0; i
< nelt
; i
++)
5459 sel
[i
] = nelt
/ 2 + i
;
5460 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5462 if (dump_enabled_p ())
5463 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5464 "shift permutation is not supported by target\n");
5467 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5469 /* Generating permutation constant to select vector from 2.
5470 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5471 for (i
= 0; i
< nelt
/ 2; i
++)
5473 for (i
= nelt
/ 2; i
< nelt
; i
++)
5475 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5477 if (dump_enabled_p ())
5478 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5479 "select is not supported by target\n");
5482 select_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5484 for (i
= 0; i
< log_length
; i
++)
5486 for (j
= 0; j
< length
; j
+= 2)
5488 first_vect
= dr_chain
[j
];
5489 second_vect
= dr_chain
[j
+ 1];
5491 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5492 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5493 first_vect
, first_vect
,
5495 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5498 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
5499 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5500 second_vect
, second_vect
,
5502 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5505 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
5506 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5507 vect
[0], vect
[1], shift1_mask
);
5508 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5509 (*result_chain
)[j
/2 + length
/2] = data_ref
;
5511 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
5512 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5513 vect
[0], vect
[1], select_mask
);
5514 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5515 (*result_chain
)[j
/2] = data_ref
;
5517 memcpy (dr_chain
.address (), result_chain
->address (),
5518 length
* sizeof (tree
));
5522 if (length
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo
) > 2)
5524 unsigned int k
= 0, l
= 0;
5526 /* Generating permutation constant to get all elements in rigth order.
5527 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5528 for (i
= 0; i
< nelt
; i
++)
5530 if (3 * k
+ (l
% 3) >= nelt
)
5533 l
+= (3 - (nelt
% 3));
5535 sel
[i
] = 3 * k
+ (l
% 3);
5538 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5540 if (dump_enabled_p ())
5541 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5542 "shuffle of 3 fields structure is not \
5543 supported by target\n");
5546 perm3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5548 /* Generating permutation constant to shift all elements.
5549 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5550 for (i
= 0; i
< nelt
; i
++)
5551 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
5552 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5554 if (dump_enabled_p ())
5555 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5556 "shift permutation is not supported by target\n");
5559 shift1_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5561 /* Generating permutation constant to shift all elements.
5562 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5563 for (i
= 0; i
< nelt
; i
++)
5564 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
5565 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5567 if (dump_enabled_p ())
5568 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5569 "shift permutation is not supported by target\n");
5572 shift2_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5574 /* Generating permutation constant to shift all elements.
5575 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5576 for (i
= 0; i
< nelt
; i
++)
5577 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5578 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5580 if (dump_enabled_p ())
5581 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5582 "shift permutation is not supported by target\n");
5585 shift3_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5587 /* Generating permutation constant to shift all elements.
5588 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5589 for (i
= 0; i
< nelt
; i
++)
5590 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
5591 if (!can_vec_perm_p (TYPE_MODE (vectype
), false, sel
))
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5595 "shift permutation is not supported by target\n");
5598 shift4_mask
= vect_gen_perm_mask_checked (vectype
, sel
);
5600 for (k
= 0; k
< 3; k
++)
5602 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
5603 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5604 dr_chain
[k
], dr_chain
[k
],
5606 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5610 for (k
= 0; k
< 3; k
++)
5612 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
5613 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5614 vect
[k
% 3], vect
[(k
+ 1) % 3],
5616 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5617 vect_shift
[k
] = data_ref
;
5620 for (k
= 0; k
< 3; k
++)
5622 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
5623 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
5624 vect_shift
[(4 - k
) % 3],
5625 vect_shift
[(3 - k
) % 3],
5627 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5631 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
5633 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
5634 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
5635 vect
[0], shift3_mask
);
5636 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5637 (*result_chain
)[nelt
% 3] = data_ref
;
5639 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
5640 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
5641 vect
[1], shift4_mask
);
5642 vect_finish_stmt_generation (stmt
, perm_stmt
, gsi
);
5643 (*result_chain
)[0] = data_ref
;
5649 /* Function vect_transform_grouped_load.
5651 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5652 to perform their permutation and ascribe the result vectorized statements to
5653 the scalar statements.
5657 vect_transform_grouped_load (gimple
*stmt
, vec
<tree
> dr_chain
, int size
,
5658 gimple_stmt_iterator
*gsi
)
/* Scratch vector that receives the permuted vectors; released before
   returning.  */
5661 vec
<tree
> result_chain
= vNULL
;
5663 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5664 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5665 vectors, that are ready for vector computation. */
5666 result_chain
.create (size
);
5668 /* If reassociation width for vector type is 2 or greater target machine can
5669 execute 2 or more vector instructions in parallel. Otherwise try to
5670 get chain for loads group using vect_shift_permute_load_chain. */
5671 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt
)));
/* Use the generic permute chain when the target has enough reassociation
   parallelism, when SIZE is a power of two, or when the shift-based
   scheme declines this group.  */
5672 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
5673 || exact_log2 (size
) != -1
5674 || !vect_shift_permute_load_chain (dr_chain
, size
, stmt
,
5675 gsi
, &result_chain
))
5676 vect_permute_load_chain (dr_chain
, size
, stmt
, gsi
, &result_chain
);
/* Attach each permuted vector to the VEC_STMT of its scalar statement.  */
5677 vect_record_grouped_load_vectors (stmt
, result_chain
);
5678 result_chain
.release ();
5681 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5682 generated as part of the vectorization of STMT. Assign the statement
5683 for each vector to the associated scalar statement. */
5686 vect_record_grouped_load_vectors (gimple
*stmt
, vec
<tree
> result_chain
)
/* First scalar load of the interleaving group; the scan starts here.  */
5688 gimple
*first_stmt
= GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt
));
5689 gimple
*next_stmt
, *new_stmt
;
5690 unsigned int i
, gap_count
;
5693 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5694 Since we scan the chain starting from it's first node, their order
5695 corresponds the order of data-refs in RESULT_CHAIN. */
5696 next_stmt
= first_stmt
;
5698 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
5703 /* Skip the gaps. Loads created for the gaps will be removed by dead
5704 code elimination pass later. No need to check for the first stmt in
5705 the group, since it always exists.
5706 GROUP_GAP is the number of steps in elements from the previous
5707 access (if there is no gap GROUP_GAP is 1). We skip loads that
5708 correspond to the gaps. */
5709 if (next_stmt
!= first_stmt
5710 && gap_count
< GROUP_GAP (vinfo_for_stmt (next_stmt
)))
/* TMP_DATA_REF is an SSA name; its defining statement is the vector
   load produced for this group member.  */
5718 new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
5719 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5720 copies, and we put the new vector statement in the first available
5722 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)))
5723 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
)) = new_stmt
;
/* NOTE(review): the code below walks the RELATED_STMT chain to append
   NEW_STMT at its end; several lines of the walk (assignment targets,
   loop structure) appear elided in this extract -- verify against the
   full file before relying on this summary.  */
5726 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5729 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt
));
5731 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
));
5734 prev_stmt
= rel_stmt
;
5736 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt
));
5739 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt
)) =
/* Advance to the next scalar load in the interleaving group.  */
5744 next_stmt
= GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt
));
5746 /* If NEXT_STMT accesses the same DR as the previous statement,
5747 put the same TMP_DATA_REF as its vectorized statement; otherwise
5748 get the next data-ref from RESULT_CHAIN. */
5749 if (!next_stmt
|| !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt
)))
5755 /* Function vect_force_dr_alignment_p.
5757 Returns whether the alignment of a DECL can be forced to be aligned
5758 on ALIGNMENT bit boundary. */
5761 vect_can_force_dr_alignment_p (const_tree decl
, unsigned int alignment
)
/* Only variable declarations can have their alignment raised.  */
5763 if (TREE_CODE (decl
) != VAR_DECL
)
/* A declaration with a symbol-table entry (e.g. external or aliased)
   may veto increasing its alignment.  */
5766 if (decl_in_symtab_p (decl
)
5767 && !symtab_node::get (decl
)->can_increase_alignment_p ())
/* Static storage is limited by the object file format's maximum
   alignment; automatic storage by the maximum stack alignment.  */
5770 if (TREE_STATIC (decl
))
5771 return (alignment
<= MAX_OFILE_ALIGNMENT
);
5773 return (alignment
<= MAX_STACK_ALIGNMENT
);
5777 /* Return whether the data reference DR is supported with respect to its
5779 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5780 it is aligned, i.e., check if it is possible to vectorize it with different
5783 enum dr_alignment_support
5784 vect_supportable_dr_alignment (struct data_reference
*dr
,
5785 bool check_aligned_accesses
)
5787 gimple
*stmt
= DR_STMT (dr
);
5788 stmt_vec_info stmt_info
= vinfo_for_stmt (stmt
);
5789 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5790 machine_mode mode
= TYPE_MODE (vectype
);
5791 loop_vec_info loop_vinfo
= STMT_VINFO_LOOP_VINFO (stmt_info
);
5792 struct loop
*vect_loop
= NULL
;
5793 bool nested_in_vect_loop
= false;
/* An already aligned access is trivially supported, unless the caller
   asked us to evaluate the unaligned strategies anyway.  */
5795 if (aligned_access_p (dr
) && !check_aligned_accesses
)
5798 /* For now assume all conditional loads/stores support unaligned
5799 access without any special code. */
5800 if (is_gimple_call (stmt
)
5801 && gimple_call_internal_p (stmt
)
5802 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
5803 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
5804 return dr_unaligned_supported
;
5808 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5809 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt
);
5812 /* Possibly unaligned access. */
5814 /* We can choose between using the implicit realignment scheme (generating
5815 a misaligned_move stmt) and the explicit realignment scheme (generating
5816 aligned loads with a REALIGN_LOAD). There are two variants to the
5817 explicit realignment scheme: optimized, and unoptimized.
5818 We can optimize the realignment only if the step between consecutive
5819 vector loads is equal to the vector size. Since the vector memory
5820 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5821 is guaranteed that the misalignment amount remains the same throughout the
5822 execution of the vectorized loop. Therefore, we can create the
5823 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5824 at the loop preheader.
5826 However, in the case of outer-loop vectorization, when vectorizing a
5827 memory access in the inner-loop nested within the LOOP that is now being
5828 vectorized, while it is guaranteed that the misalignment of the
5829 vectorized memory access will remain the same in different outer-loop
5830 iterations, it is *not* guaranteed that is will remain the same throughout
5831 the execution of the inner-loop. This is because the inner-loop advances
5832 with the original scalar step (and not in steps of VS). If the inner-loop
5833 step happens to be a multiple of VS, then the misalignment remains fixed
5834 and we can use the optimized realignment scheme. For example:
5840 When vectorizing the i-loop in the above example, the step between
5841 consecutive vector loads is 1, and so the misalignment does not remain
5842 fixed across the execution of the inner-loop, and the realignment cannot
5843 be optimized (as illustrated in the following pseudo vectorized loop):
5845 for (i=0; i<N; i+=4)
5846 for (j=0; j<M; j++){
5847 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5848 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5849 // (assuming that we start from an aligned address).
5852 We therefore have to use the unoptimized realignment scheme:
5854 for (i=0; i<N; i+=4)
5855 for (j=k; j<M; j+=4)
5856 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5857 // that the misalignment of the initial address is
5860 The loop can then be vectorized as follows:
5862 for (k=0; k<4; k++){
5863 rt = get_realignment_token (&vp[k]);
5864 for (i=0; i<N; i+=4){
5866 for (j=k; j<M; j+=4){
5868 va = REALIGN_LOAD <v1,v2,rt>;
5875 if (DR_IS_READ (dr
))
5877 bool is_packed
= false;
5878 tree type
= (TREE_TYPE (DR_REF (dr
)));
/* Prefer the explicit realignment scheme when the target implements
   vec_realign_load and can provide a mask for REALIGN_LOAD.  */
5880 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
5881 && (!targetm
.vectorize
.builtin_mask_for_load
5882 || targetm
.vectorize
.builtin_mask_for_load ()))
5884 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
/* In a nested loop, a DR step different from the vector size means the
   misalignment varies across inner-loop iterations, so only the
   unoptimized explicit scheme applies (see the big comment above).  */
5885 if ((nested_in_vect_loop
5886 && (TREE_INT_CST_LOW (DR_STEP (dr
))
5887 != GET_MODE_SIZE (TYPE_MODE (vectype
))))
5889 return dr_explicit_realign
;
5891 return dr_explicit_realign_optimized
;
/* Unknown alignment: treat a reference whose size is not a multiple of
   its alignment as packed.  */
5893 if (!known_alignment_for_access_p (dr
))
5894 is_packed
= not_size_aligned (DR_REF (dr
));
5896 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5897 || targetm
.vectorize
.
5898 support_vector_misalignment (mode
, type
,
5899 DR_MISALIGNMENT (dr
), is_packed
))
5900 /* Can't software pipeline the loads, but can at least do them. */
5901 return dr_unaligned_supported
;
/* Store case: no realignment scheme exists for writes, so the target
   must support the misaligned access directly.  */
5905 bool is_packed
= false;
5906 tree type
= (TREE_TYPE (DR_REF (dr
)));
5908 if (!known_alignment_for_access_p (dr
))
5909 is_packed
= not_size_aligned (DR_REF (dr
));
5911 if ((TYPE_USER_ALIGN (type
) && !is_packed
)
5912 || targetm
.vectorize
.
5913 support_vector_misalignment (mode
, type
,
5914 DR_MISALIGNMENT (dr
), is_packed
))
5915 return dr_unaligned_supported
;
/* Neither realignment nor target support: unsupported.  */
5919 return dr_unaligned_unsupported
;