gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
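/* A hedged illustration (target details are assumptions, not guarantees):
   for VECTYPE V4SImode and COUNT 2 the target may provide a dedicated
   two-vector array mode, or else we fall back to a plain 256-bit integer
   mode; the optab query below then asks whether something like a
   de-interleaving load (e.g. AArch64 LD2) exists for that
   <array mode, V4SImode> pair.  */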
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
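/* A hedged worked example (a 16-byte vector size is assumed):

     short s = in[i];     // smallest scalar type: short (2 bytes)
     int   x = (int) s;   // promotion; the lhs type is int (4 bytes)
     out[i] = x;

   The VF must be derived from the 2-byte elements (16 / 2 = 8), which is
   why the rhs of promotions has to be inspected as well.  */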
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 {
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
147 }
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 {
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
152 {
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
155 /* gimple_expr_type already picked the type of the loaded
156 or stored data. */
157 i = ~0U;
158 else if (internal_fn_mask_index (ifn) == 0)
159 i = 1;
160 }
161 if (i < gimple_call_num_args (call))
162 {
163 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
164 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 {
166 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
167 if (rhs < lhs)
168 scalar_type = rhs_type;
169 }
170 }
171 }
172
173 *lhs_size_unit = lhs;
174 *rhs_size_unit = rhs;
175 return scalar_type;
176 }
177
178
179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
180 tested at run-time. Return TRUE if DDR was successfully inserted.
181 Return false if versioning is not supported. */
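/* A hedged sketch of what the versioning check amounts to at run time
   (segment lengths and the exact form of the test are computed later,
   elsewhere in the vectorizer):

     if (a + segment_len_a <= b || b + segment_len_b <= a)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...  */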
182
183 static opt_result
184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 {
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187
188 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
189 return opt_result::failure_at (vect_location,
190 "will not create alias checks, as"
191 " --param vect-max-version-for-alias-checks"
192 " == 0\n");
193
194 opt_result res
195 = runtime_alias_check_p (ddr, loop,
196 optimize_loop_nest_for_speed_p (loop));
197 if (!res)
198 return res;
199
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return opt_result::success ();
202 }
203
204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205
206 static void
207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 {
209 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
210 for (unsigned int i = 0; i < checks.length(); ++i)
211 if (checks[i] == value)
212 return;
213
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location,
216 "need run-time check that %T is nonzero\n",
217 value);
218 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
219 }
220
221 /* Return true if we know that the order of vectorized DR_INFO_A and
222 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
223 DR_INFO_B. At least one of the accesses is a write. */
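/* A hedged example of the kind of reordering this function detects: in

     a[i]   = x;      // store group, emitted at the last scalar store
     ...    = a[i];   // single load, kept in place
     a[i+1] = y;

   sinking the grouped store {a[i], a[i+1]} to the position of the last
   scalar store would move the write of a[i] below the read of a[i],
   changing the scalar read-after-write order.  */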
224
225 static bool
226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 {
228 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
229 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230
231 /* Single statements are always kept in their original order. */
232 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
233 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
234 return true;
235
236 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
237 SLP group are emitted at the position of the last scalar load and
238 all loads in an interleaving group are emitted at the position
239 of the first scalar load.
240 Stores in a group are emitted at the position of the last scalar store.
241 Compute that position and check whether the resulting order matches
242 the current one.
243 We have not yet decided between SLP and interleaving so we have
244 to conservatively assume both. */
245 stmt_vec_info il_a;
246 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
247 if (last_a)
248 {
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 last_a = get_later_stmt (last_a, s);
252 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
253 {
254 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
255 s = DR_GROUP_NEXT_ELEMENT (s))
256 if (get_later_stmt (il_a, s) == il_a)
257 il_a = s;
258 }
259 else
260 il_a = last_a;
261 }
262 else
263 last_a = il_a = stmtinfo_a;
264 stmt_vec_info il_b;
265 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
266 if (last_b)
267 {
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 last_b = get_later_stmt (last_b, s);
271 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
272 {
273 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
274 s = DR_GROUP_NEXT_ELEMENT (s))
275 if (get_later_stmt (il_b, s) == il_b)
276 il_b = s;
277 }
278 else
279 il_b = last_b;
280 }
281 else
282 last_b = il_b = stmtinfo_b;
283 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
284 return (/* SLP */
285 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
286 /* Interleaving */
287 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
288 /* Mixed */
289 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
290 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
291 }
292
293 /* A subroutine of vect_analyze_data_ref_dependence. Handle
294 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
295 distances. These distances are conservatively correct but they don't
296 reflect a guaranteed dependence.
297
298 Return true if this function does all the work necessary to avoid
299 an alias or false if the caller should use the dependence distances
300 to limit the vectorization factor in the usual way. LOOP_DEPTH is
301 the depth of the loop described by LOOP_VINFO and the other arguments
302 are as for vect_analyze_data_ref_dependence. */
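/* A hedged example of such a ddr (names are illustrative): for pointer
   parameters p and q,

     for (i = 0; i < n; i++)
       p[i + 3] = q[i] + 1;

   the two accesses are either completely independent or, if p == q,
   have a known dependence distance of 3.  Either the user's safelen
   assertion covers that distance, or we prefer a runtime alias check
   over capping the VF.  */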
303
304 static bool
305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
306 loop_vec_info loop_vinfo,
307 int loop_depth, unsigned int *max_vf)
308 {
309 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
310 lambda_vector dist_v;
311 unsigned int i;
312 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
313 {
314 int dist = dist_v[loop_depth];
315 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
316 {
317 /* If the user asserted safelen >= DIST consecutive iterations
318 can be executed concurrently, assume independence.
319
320 ??? An alternative would be to add the alias check even
321 in this case, and vectorize the fallback loop with the
322 maximum VF set to safelen. However, if the user has
323 explicitly given a length, it's less likely that that
324 would be a win. */
325 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
326 {
327 if ((unsigned int) loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 continue;
331 }
332
333 /* For dependence distances of 2 or more, we have the option
334 of limiting VF or checking for an alias at runtime.
335 Prefer to check at runtime if we can, to avoid limiting
336 the VF unnecessarily when the bases are in fact independent.
337
338 Note that the alias checks will be removed if the VF ends up
339 being small enough. */
340 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
341 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
342 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
343 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
344 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
345 }
346 }
347 return true;
348 }
349
350
351 /* Function vect_analyze_data_ref_dependence.
352
353 FIXME: I needed to change the sense of the returned flag.
354
355 Return FALSE if there (might) exist a dependence between a memory-reference
356 DRA and a memory-reference DRB. When versioning for alias may check a
357 dependence at run-time, return TRUE. Adjust *MAX_VF according to
358 the data dependence. */
359
360 static opt_result
361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
362 loop_vec_info loop_vinfo,
363 unsigned int *max_vf)
364 {
365 unsigned int i;
366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
367 struct data_reference *dra = DDR_A (ddr);
368 struct data_reference *drb = DDR_B (ddr);
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
371 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
372 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
373 lambda_vector dist_v;
374 unsigned int loop_depth;
375
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379 gcc_unreachable ();
380
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383 return opt_result::success ();
384
385 if (dra == drb
386 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387 return opt_result::success ();
388
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
391 group size. */
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396 return opt_result::success ();
397
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
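/* A hedged example (types are illustrative): a loop that reads through
   an 'int *' and writes through a 'float *' can at worst have an
   anti-dependence between the two accesses; because their alias sets do
   not conflict, TBAA lets us assume they do not overlap in a conforming
   program, so the dependence can be ignored.  */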
404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 get_alias_set (DR_REF (drb))))
408 return opt_result::success ();
409
410 /* Unknown data dependence. */
411 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
412 {
413 /* If user asserted safelen consecutive iterations can be
414 executed concurrently, assume independence. */
415 if (loop->safelen >= 2)
416 {
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return opt_result::success ();
421 }
422
423 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
424 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
425 return opt_result::failure_at
426 (stmtinfo_a->stmt,
427 "versioning for alias not supported for: "
428 "can't determine dependence between %T and %T\n",
429 DR_REF (dra), DR_REF (drb));
430
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
433 "versioning for alias required: "
434 "can't determine dependence between %T and %T\n",
435 DR_REF (dra), DR_REF (drb));
436
437 /* Add to list of ddrs that need to be tested at run-time. */
438 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
439 }
440
441 /* Known data dependence. */
442 if (DDR_NUM_DIST_VECTS (ddr) == 0)
443 {
444 /* If user asserted safelen consecutive iterations can be
445 executed concurrently, assume independence. */
446 if (loop->safelen >= 2)
447 {
448 if ((unsigned int) loop->safelen < *max_vf)
449 *max_vf = loop->safelen;
450 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
451 return opt_result::success ();
452 }
453
454 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
455 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 return opt_result::failure_at
457 (stmtinfo_a->stmt,
458 "versioning for alias not supported for: "
459 "bad dist vector for %T and %T\n",
460 DR_REF (dra), DR_REF (drb));
461
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
464 "versioning for alias required: "
465 "bad dist vector for %T and %T\n",
466 DR_REF (dra), DR_REF (drb));
467 /* Add to list of ddrs that need to be tested at run-time. */
468 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
469 }
470
471 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
472
473 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
474 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
475 loop_depth, max_vf))
476 return opt_result::success ();
477
478 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
479 {
480 int dist = dist_v[loop_depth];
481
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "dependence distance = %d.\n", dist);
485
486 if (dist == 0)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "dependence distance == 0 between %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492
493 /* When we perform grouped accesses and perform implicit CSE
494 by detecting equal accesses and doing disambiguation with
495 runtime alias tests like for
496 .. = a[i];
497 .. = a[i+1];
498 a[i] = ..;
499 a[i+1] = ..;
500 *p = ..;
501 .. = a[i];
502 .. = a[i+1];
503 where we will end up loading { a[i], a[i+1] } once, make
504 sure that inserting group loads before the first load and
505 stores after the last store will do the right thing.
506 Similar for groups like
507 a[i] = ...;
508 ... = a[i];
509 a[i+1] = ...;
510 where loads from the group interleave with the store. */
511 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
512 return opt_result::failure_at (stmtinfo_a->stmt,
513 "READ_WRITE dependence"
514 " in interleaving.\n");
515
516 if (loop->safelen < 2)
517 {
518 tree indicator = dr_zero_step_indicator (dra);
519 if (!indicator || integer_zerop (indicator))
520 return opt_result::failure_at (stmtinfo_a->stmt,
521 "access also has a zero step\n");
522 else if (TREE_CODE (indicator) != INTEGER_CST)
523 vect_check_nonzero_value (loop_vinfo, indicator);
524 }
525 continue;
526 }
527
528 if (dist > 0 && DDR_REVERSED_P (ddr))
529 {
530 /* If DDR_REVERSED_P the order of the data-refs in DDR was
531 reversed (to make distance vector positive), and the actual
532 distance is negative. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance negative.\n");
536 /* When doing outer loop vectorization, we need to check if there is
537 a backward dependence at the inner loop level if the dependence
538 at the outer loop is reversed. See PR81740. */
539 if (nested_in_vect_loop_p (loop, stmtinfo_a)
540 || nested_in_vect_loop_p (loop, stmtinfo_b))
541 {
542 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
543 DDR_LOOP_NEST (ddr));
544 if (dist_v[inner_depth] < 0)
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "not vectorized, dependence "
547 "between data-refs %T and %T\n",
548 DR_REF (dra), DR_REF (drb));
549 }
550 /* Record a negative dependence distance to later limit the
551 amount of stmt copying / unrolling we can perform.
552 Only need to handle read-after-write dependence. */
553 if (DR_IS_READ (drb)
554 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
555 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
556 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
557 continue;
558 }
559
560 unsigned int abs_dist = abs (dist);
561 if (abs_dist >= 2 && abs_dist < *max_vf)
562 {
563 /* The dependence distance requires reduction of the maximal
564 vectorization factor. */
565 *max_vf = abs_dist;
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "adjusting maximal vectorization factor to %i\n",
569 *max_vf);
570 }
571
572 if (abs_dist >= *max_vf)
573 {
574 /* Dependence distance does not create dependence, as far as
575 vectorization is concerned, in this case. */
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_NOTE, vect_location,
578 "dependence distance >= VF.\n");
579 continue;
580 }
581
582 return opt_result::failure_at (stmtinfo_a->stmt,
583 "not vectorized, possible dependence "
584 "between data-refs %T and %T\n",
585 DR_REF (dra), DR_REF (drb));
586 }
587
588 return opt_result::success ();
589 }
590
591 /* Function vect_analyze_data_ref_dependences.
592
593 Examine all the data references in the loop, and make sure there do not
594 exist any data dependences between them. Set *MAX_VF according to
595 the maximum vectorization factor the data dependences allow. */
596
597 opt_result
598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
599 unsigned int *max_vf)
600 {
601 unsigned int i;
602 struct data_dependence_relation *ddr;
603
604 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
605
606 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
607 {
608 LOOP_VINFO_DDRS (loop_vinfo)
609 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
610 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
611 /* We need read-read dependences to compute
612 STMT_VINFO_SAME_ALIGN_REFS. */
613 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
614 &LOOP_VINFO_DDRS (loop_vinfo),
615 LOOP_VINFO_LOOP_NEST (loop_vinfo),
616 true);
617 gcc_assert (res);
618 }
619
620 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
621
622 /* For epilogues we either have no aliases or alias versioning
623 was applied to original loop. Therefore we may just get max_vf
624 using VF of original loop. */
625 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
626 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
627 else
628 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
629 {
630 opt_result res
631 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
632 if (!res)
633 return res;
634 }
635
636 return opt_result::success ();
637 }
638
639
640 /* Function vect_slp_analyze_data_ref_dependence.
641
642 Return TRUE if there (might) exist a dependence between a memory-reference
643 DRA and a memory-reference DRB for VINFO. When versioning for alias
644 may check a dependence at run-time, return FALSE. */
646
647 static bool
648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
649 struct data_dependence_relation *ddr)
650 {
651 struct data_reference *dra = DDR_A (ddr);
652 struct data_reference *drb = DDR_B (ddr);
653 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
654 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
655
656 /* We need to check dependences of statements marked as unvectorizable
657 as well, they still can prohibit vectorization. */
658
659 /* Independent data accesses. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
661 return false;
662
663 if (dra == drb)
664 return false;
665
666 /* Read-read is OK. */
667 if (DR_IS_READ (dra) && DR_IS_READ (drb))
668 return false;
669
670 /* If dra and drb are part of the same interleaving chain consider
671 them independent. */
672 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
673 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
674 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
675 return false;
676
677 /* Unknown data dependence. */
678 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "can't determine dependence between %T and %T\n",
683 DR_REF (dra), DR_REF (drb));
684 }
685 else if (dump_enabled_p ())
686 dump_printf_loc (MSG_NOTE, vect_location,
687 "determined dependence between %T and %T\n",
688 DR_REF (dra), DR_REF (drb));
689
690 return true;
691 }
692
693
694 /* Analyze dependences involved in the transform of SLP NODE. STORES
695 contain the vector of scalar stores of this instance if we are
696 disambiguating the loads. */
697
698 static bool
699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700 vec<stmt_vec_info> stores,
701 stmt_vec_info last_store_info)
702 {
703 /* This walks over all stmts involved in the SLP load/store done
704 in NODE verifying we can sink them up to the last stmt in the
705 group. */
706 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
707 vec_info *vinfo = last_access_info->vinfo;
708 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
709 {
710 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
711 if (access_info == last_access_info)
712 continue;
713 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
714 ao_ref ref;
715 bool ref_initialized_p = false;
716 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
717 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
718 {
719 gimple *stmt = gsi_stmt (gsi);
720 if (! gimple_vuse (stmt)
721 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
722 continue;
723
724 /* If we couldn't record a (single) data reference for this
725 stmt we have to resort to the alias oracle. */
726 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
727 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
728 if (!dr_b)
729 {
730 /* We are moving a store or sinking a load - this means
731 we cannot use TBAA for disambiguation. */
732 if (!ref_initialized_p)
733 ao_ref_init (&ref, DR_REF (dr_a));
734 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
735 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
736 return false;
737 continue;
738 }
739
740 bool dependent = false;
741 /* If we run into a store of this same instance (we've just
742 marked those) then delay dependence checking until we run
743 into the last store because this is where it will have
744 been sunk to (and we verify if we can do that as well). */
745 if (gimple_visited_p (stmt))
746 {
747 if (stmt_info != last_store_info)
748 continue;
749 unsigned i;
750 stmt_vec_info store_info;
751 FOR_EACH_VEC_ELT (stores, i, store_info)
752 {
753 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
754 ddr_p ddr = initialize_data_dependence_relation
755 (dr_a, store_dr, vNULL);
756 dependent
757 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
758 free_dependence_relation (ddr);
759 if (dependent)
760 break;
761 }
762 }
763 else
764 {
765 ddr_p ddr = initialize_data_dependence_relation (dr_a,
766 dr_b, vNULL);
767 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
768 free_dependence_relation (ddr);
769 }
770 if (dependent)
771 return false;
772 }
773 }
774 return true;
775 }
776
777
778 /* Function vect_slp_analyze_instance_dependence.
779
780 Examine all the data references in the SLP instance, and make sure there
781 do not exist any data dependences between them. Return true if the
782 dependences do not prevent vectorizing the instance. */
783
784 bool
785 vect_slp_analyze_instance_dependence (slp_instance instance)
786 {
787 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
788
789 /* The stores of this instance are at the root of the SLP tree. */
790 slp_tree store = SLP_INSTANCE_TREE (instance);
791 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
792 store = NULL;
793
794 /* Verify we can sink stores to the vectorized stmt insert location. */
795 stmt_vec_info last_store_info = NULL;
796 if (store)
797 {
798 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
799 return false;
800
801 /* Mark stores in this instance and remember the last one. */
802 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
803 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
804 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
805 }
806
807 bool res = true;
808
809 /* Verify we can sink loads to the vectorized stmt insert location,
810 special-casing stores of this instance. */
811 slp_tree load;
812 unsigned int i;
813 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
814 if (! vect_slp_analyze_node_dependences (instance, load,
815 store
816 ? SLP_TREE_SCALAR_STMTS (store)
817 : vNULL, last_store_info))
818 {
819 res = false;
820 break;
821 }
822
823 /* Unset the visited flag. */
824 if (store)
825 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
826 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
827
828 return res;
829 }
830
831 /* Record the base alignment guarantee given by DRB, which occurs
832 in STMT_INFO. */
833
834 static void
835 vect_record_base_alignment (stmt_vec_info stmt_info,
836 innermost_loop_behavior *drb)
837 {
838 vec_info *vinfo = stmt_info->vinfo;
839 bool existed;
840 innermost_loop_behavior *&entry
841 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
842 if (!existed || entry->base_alignment < drb->base_alignment)
843 {
844 entry = drb;
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_NOTE, vect_location,
847 "recording new base alignment for %T\n"
848 " alignment: %d\n"
849 " misalignment: %d\n"
850 " based on: %G",
851 drb->base_address,
852 drb->base_alignment,
853 drb->base_misalignment,
854 stmt_info->stmt);
855 }
856 }
857
858 /* If the region we're going to vectorize is reached, all unconditional
859 data references occur at least once. We can therefore pool the base
860 alignment guarantees from each unconditional reference. Do this by
861 going through all the data references in VINFO and checking whether
862 the containing statement makes the reference unconditionally. If so,
863 record the alignment of the base address in VINFO so that it can be
864 used for all other references with the same base. */
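/* A hedged example: if the loop body unconditionally executes '*p = 0.0'
   and analysis of that reference proves a 32-byte aligned base, then a
   second, conditional reference 'if (c) x = *p;' with the same base
   address may reuse the 32-byte guarantee, even though the conditional
   reference on its own could not be used to establish it.  */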
865
866 void
867 vect_record_base_alignments (vec_info *vinfo)
868 {
869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
870 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
871 data_reference *dr;
872 unsigned int i;
873 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
874 {
875 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
876 stmt_vec_info stmt_info = dr_info->stmt;
877 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
878 && STMT_VINFO_VECTORIZABLE (stmt_info)
879 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
880 {
881 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
882
883 /* If DR is nested in the loop that is being vectorized, we can also
884 record the alignment of the base wrt the outer loop. */
885 if (loop && nested_in_vect_loop_p (loop, stmt_info))
886 vect_record_base_alignment
887 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
888 }
889 }
890 }
891
892 /* Return the target alignment for the vectorized form of DR_INFO. */
893
894 static poly_uint64
895 vect_calculate_target_alignment (dr_vec_info *dr_info)
896 {
897 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
898 return targetm.vectorize.preferred_vector_alignment (vectype);
899 }
900
901 /* Function vect_compute_data_ref_alignment
902
903 Compute the misalignment of the data reference DR_INFO.
904
905 Output:
906 1. DR_MISALIGNMENT (DR_INFO) is defined.
907
908 FOR NOW: No analysis is actually performed. Misalignment is calculated
909 only for trivial cases. TODO. */
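/* A hedged worked example (numbers are illustrative): with a target
   alignment of 16 bytes, a base proved to be 16-byte aligned
   (base_misalignment 0) and a constant DR_INIT of 4, the misalignment
   recorded below is (0 + 4) mod 16 = 4 bytes.  If 16-byte base
   alignment can be neither proved nor forced, the misalignment is left
   unknown.  */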
910
911 static void
912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
913 {
914 stmt_vec_info stmt_info = dr_info->stmt;
915 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
917 struct loop *loop = NULL;
918 tree ref = DR_REF (dr_info->dr);
919 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
920
921 if (dump_enabled_p ())
922 dump_printf_loc (MSG_NOTE, vect_location,
923 "vect_compute_data_ref_alignment:\n");
924
925 if (loop_vinfo)
926 loop = LOOP_VINFO_LOOP (loop_vinfo);
927
928 /* Initialize misalignment to unknown. */
929 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
930
931 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
932 return;
933
934 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
935 bool step_preserves_misalignment_p;
936
937 poly_uint64 vector_alignment
938 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
939 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
940
941 unsigned HOST_WIDE_INT vect_align_c;
942 if (!vector_alignment.is_constant (&vect_align_c))
943 return;
944
945 /* No step for BB vectorization. */
946 if (!loop)
947 {
948 gcc_assert (integer_zerop (drb->step));
949 step_preserves_misalignment_p = true;
950 }
951
952 /* In case the dataref is in an inner-loop of the loop that is being
953 vectorized (LOOP), we use the base and misalignment information
954 relative to the outer-loop (LOOP). This is ok only if the misalignment
955 stays the same throughout the execution of the inner-loop, which is why
956 we have to check that the stride of the dataref in the inner-loop is a
957 multiple of the vector alignment.
958 else if (nested_in_vect_loop_p (loop, stmt_info))
959 {
960 step_preserves_misalignment_p
961 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
962
963 if (dump_enabled_p ())
964 {
965 if (step_preserves_misalignment_p)
966 dump_printf_loc (MSG_NOTE, vect_location,
967 "inner step divides the vector alignment.\n");
968 else
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "inner step doesn't divide the vector"
971 " alignment.\n");
972 }
973 }
974
975 /* Similarly we can only use base and misalignment information relative to
976 an innermost loop if the misalignment stays the same throughout the
977 execution of the loop. As above, this is the case if the stride of
978 the dataref, scaled by the vectorization factor, is a multiple of the alignment.
979 else
980 {
981 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
982 step_preserves_misalignment_p
983 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
984
985 if (!step_preserves_misalignment_p && dump_enabled_p ())
986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 "step doesn't divide the vector alignment.\n");
988 }
989
990 unsigned int base_alignment = drb->base_alignment;
991 unsigned int base_misalignment = drb->base_misalignment;
992
993 /* Calculate the maximum of the pooled base address alignment and the
994 alignment that we can compute for DR itself. */
995 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
996 if (entry && base_alignment < (*entry)->base_alignment)
997 {
998 base_alignment = (*entry)->base_alignment;
999 base_misalignment = (*entry)->base_misalignment;
1000 }
1001
1002 if (drb->offset_alignment < vect_align_c
1003 || !step_preserves_misalignment_p
1004 /* We need to know whether the step wrt the vectorized loop is
1005 negative when computing the starting misalignment below. */
1006 || TREE_CODE (drb->step) != INTEGER_CST)
1007 {
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Unknown alignment for access: %T\n", ref);
1011 return;
1012 }
1013
1014 if (base_alignment < vect_align_c)
1015 {
1016 unsigned int max_alignment;
1017 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018 if (max_alignment < vect_align_c
1019 || !vect_can_force_dr_alignment_p (base,
1020 vect_align_c * BITS_PER_UNIT))
1021 {
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "can't force alignment of ref: %T\n", ref);
1025 return;
1026 }
1027
1028 /* Force the alignment of the decl.
1029 NOTE: This is the only change to the code we make during
1030 the analysis phase, before deciding to vectorize the loop. */
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "force alignment of %T\n", ref);
1034
1035 dr_info->base_decl = base;
1036 dr_info->base_misaligned = true;
1037 base_misalignment = 0;
1038 }
1039 poly_int64 misalignment
1040 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1041
1042 /* If this is a backward running DR then the first access in the larger
1043 vectype is actually N-1 elements before the address in the DR.
1044 Adjust misalign accordingly. */
1045 if (tree_int_cst_sgn (drb->step) < 0)
1046 /* PLUS because STEP is negative. */
1047 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048 * TREE_INT_CST_LOW (drb->step));
1049
1050 unsigned int const_misalignment;
1051 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1052 {
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "Non-constant misalignment for access: %T\n", ref);
1056 return;
1057 }
1058
1059 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1060
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "misalign = %d bytes of ref %T\n",
1064 DR_MISALIGNMENT (dr_info), ref);
1065
1066 return;
1067 }
1068
1069 /* Function vect_update_misalignment_for_peel.
1070 Sets DR_INFO's misalignment
1071 - to 0 if it has the same alignment as DR_PEEL_INFO,
1072 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1073 - to -1 (unknown) otherwise.
1074
1075 DR_INFO - the data reference whose misalignment is to be adjusted.
1076 DR_PEEL_INFO - the data reference whose misalignment is being made
1077 zero in the vector loop by the peel.
1078 NPEEL - the number of iterations in the peel loop if the misalignment
1079 of DR_PEEL_INFO is known at compile time. */
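/* A hedged worked example: with a target alignment of 16 bytes, a step
   of 4 and NPEEL = 3, a reference whose misalignment was 4 bytes ends
   up with (4 + 3 * 4) & 15 = 0 after the peel, which is exactly the
   arithmetic performed below.  */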
1080
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083 dr_vec_info *dr_peel_info, int npeel)
1084 {
1085 unsigned int i;
1086 vec<dr_p> same_aligned_drs;
1087 struct data_reference *current_dr;
1088 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1089
1090 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091 it is aligned in the vector loop. */
1092 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1094 {
1095 if (current_dr != dr_info->dr)
1096 continue;
1097 gcc_assert (!known_alignment_for_access_p (dr_info)
1098 || !known_alignment_for_access_p (dr_peel_info)
1099 || (DR_MISALIGNMENT (dr_info)
1100 == DR_MISALIGNMENT (dr_peel_info)));
1101 SET_DR_MISALIGNMENT (dr_info, 0);
1102 return;
1103 }
1104
1105 unsigned HOST_WIDE_INT alignment;
1106 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107 && known_alignment_for_access_p (dr_info)
1108 && known_alignment_for_access_p (dr_peel_info))
1109 {
1110 int misal = DR_MISALIGNMENT (dr_info);
1111 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112 misal &= alignment - 1;
1113 SET_DR_MISALIGNMENT (dr_info, misal);
1114 return;
1115 }
1116
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119 "to unknown (-1).\n");
1120 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1121 }
1122
1123
1124 /* Function verify_data_ref_alignment
1125
1126 Return TRUE if DR_INFO can be handled with respect to alignment. */
1127
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1130 {
1131 enum dr_alignment_support supportable_dr_alignment
1132 = vect_supportable_dr_alignment (dr_info, false);
1133 if (!supportable_dr_alignment)
1134 return opt_result::failure_at
1135 (dr_info->stmt->stmt,
1136 DR_IS_READ (dr_info->dr)
1137 ? "not vectorized: unsupported unaligned load: %T\n"
1138 : "not vectorized: unsupported unaligned store: %T\n",
1139 DR_REF (dr_info->dr));
1140
1141 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142 dump_printf_loc (MSG_NOTE, vect_location,
1143 "Vectorizing an unaligned access.\n");
1144
1145 return opt_result::success ();
1146 }
1147
1148 /* Function vect_verify_datarefs_alignment
1149
1150 Return TRUE if all data references in the loop can be
1151 handled with respect to alignment. */
1152
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1155 {
1156 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157 struct data_reference *dr;
1158 unsigned int i;
1159
1160 FOR_EACH_VEC_ELT (datarefs, i, dr)
1161 {
1162 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163 stmt_vec_info stmt_info = dr_info->stmt;
1164
1165 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166 continue;
1167
1168 /* For interleaving, only the alignment of the first access matters. */
1169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171 continue;
1172
1173 /* Strided accesses perform only component accesses, alignment is
1174 irrelevant for them. */
1175 if (STMT_VINFO_STRIDED_P (stmt_info)
1176 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177 continue;
1178
1179 opt_result res = verify_data_ref_alignment (dr_info);
1180 if (!res)
1181 return res;
1182 }
1183
1184 return opt_result::success ();
1185 }
1186
1187 /* Given a memory reference EXP, return whether its alignment is less
1188 than its size. */
1189
1190 static bool
1191 not_size_aligned (tree exp)
1192 {
1193 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194 return true;
1195
1196 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197 > get_object_alignment (exp));
1198 }
1199
1200 /* Function vector_alignment_reachable_p
1201
1202 Return true if vector alignment for DR_INFO is reachable by peeling
1203 a few loop iterations. Return false otherwise. */
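/* A hedged example: an access with 4-byte elements that is misaligned
   by 3 bytes can never become aligned by peeling whole iterations,
   since each peeled iteration advances the address by a multiple of 4;
   the check below therefore rejects misalignments that the element size
   does not divide.  */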
1204
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1207 {
1208 stmt_vec_info stmt_info = dr_info->stmt;
1209 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1210
1211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212 {
1213 /* For interleaved accesses we peel only if the number of iterations in
1214 the prolog loop ({VF - misalignment}) is a multiple of the
1215 number of interleaved accesses. */
1216 int elem_size, mis_in_elements;
1217
1218 /* FORNOW: handle only known alignment. */
1219 if (!known_alignment_for_access_p (dr_info))
1220 return false;
1221
1222 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224 elem_size = vector_element_size (vector_size, nelements);
1225 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1226
1227 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228 return false;
1229 }
1230
1231 /* If misalignment is known at the compile time then allow peeling
1232 only if natural alignment is reachable through peeling. */
1233 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1234 {
1235 HOST_WIDE_INT elmsize =
1236 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237 if (dump_enabled_p ())
1238 {
1239 dump_printf_loc (MSG_NOTE, vect_location,
1240 "data size = %wd. misalignment = %d.\n", elmsize,
1241 DR_MISALIGNMENT (dr_info));
1242 }
1243 if (DR_MISALIGNMENT (dr_info) % elmsize)
1244 {
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 "data size does not divide the misalignment.\n");
1248 return false;
1249 }
1250 }
1251
1252 if (!known_alignment_for_access_p (dr_info))
1253 {
1254 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "Unknown misalignment, %snaturally aligned\n",
1259 is_packed ? "not " : "");
1260 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1261 }
1262
1263 return true;
1264 }
1265
1266
1267 /* Calculate the cost of the memory access represented by DR_INFO. */
1268
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271 unsigned int *inside_cost,
1272 unsigned int *outside_cost,
1273 stmt_vector_for_cost *body_cost_vec,
1274 stmt_vector_for_cost *prologue_cost_vec)
1275 {
1276 stmt_vec_info stmt_info = dr_info->stmt;
1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278 int ncopies;
1279
1280 if (PURE_SLP_STMT (stmt_info))
1281 ncopies = 1;
1282 else
1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284
1285 if (DR_IS_READ (dr_info->dr))
1286 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287 prologue_cost_vec, body_cost_vec, false);
1288 else
1289 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "vect_get_data_access_cost: inside_cost = %d, "
1294 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296
1297
1298 typedef struct _vect_peel_info
1299 {
1300 dr_vec_info *dr_info;
1301 int npeel;
1302 unsigned int count;
1303 } *vect_peel_info;
1304
1305 typedef struct _vect_peel_extended_info
1306 {
1307 struct _vect_peel_info peel_info;
1308 unsigned int inside_cost;
1309 unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311
1312
1313 /* Peeling hashtable helpers. */
1314
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317 static inline hashval_t hash (const _vect_peel_info *);
1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324 return (hashval_t) peel_info->npeel;
1325 }
1326
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330 return (a->npeel == b->npeel);
1331 }
1332
1333
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1335
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339 int npeel)
1340 {
1341 struct _vect_peel_info elem, *slot;
1342 _vect_peel_info **new_slot;
1343 bool supportable_dr_alignment
1344 = vect_supportable_dr_alignment (dr_info, true);
1345
1346 elem.npeel = npeel;
1347 slot = peeling_htab->find (&elem);
1348 if (slot)
1349 slot->count++;
1350 else
1351 {
1352 slot = XNEW (struct _vect_peel_info);
1353 slot->npeel = npeel;
1354 slot->dr_info = dr_info;
1355 slot->count = 1;
1356 new_slot = peeling_htab->find_slot (slot, INSERT);
1357 *new_slot = slot;
1358 }
1359
1360 if (!supportable_dr_alignment
1361 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362 slot->count += VECT_MAX_COST;
1363 }
1364
1365
1366 /* Traverse peeling hash table to find peeling option that aligns maximum
1367 number of data accesses. */
1368
1369 int
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371 _vect_peel_extended_info *max)
1372 {
1373 vect_peel_info elem = *slot;
1374
1375 if (elem->count > max->peel_info.count
1376 || (elem->count == max->peel_info.count
1377 && max->peel_info.npeel > elem->npeel))
1378 {
1379 max->peel_info.npeel = elem->npeel;
1380 max->peel_info.count = elem->count;
1381 max->peel_info.dr_info = elem->dr_info;
1382 }
1383
1384 return 1;
1385 }
1386
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1389 we assume DR0_INFO's misalignment will be zero after peeling. */
1390
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393 dr_vec_info *dr0_info,
1394 unsigned int *inside_cost,
1395 unsigned int *outside_cost,
1396 stmt_vector_for_cost *body_cost_vec,
1397 stmt_vector_for_cost *prologue_cost_vec,
1398 unsigned int npeel,
1399 bool unknown_misalignment)
1400 {
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 unsigned i;
1403 data_reference *dr;
1404
1405 FOR_EACH_VEC_ELT (datarefs, i, dr)
1406 {
1407 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408 stmt_vec_info stmt_info = dr_info->stmt;
1409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 continue;
1411
1412 /* For interleaving, only the alignment of the first access
1413 matters. */
1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 continue;
1417
1418 /* Strided accesses perform only component accesses, alignment is
1419 irrelevant for them. */
1420 if (STMT_VINFO_STRIDED_P (stmt_info)
1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 continue;
1423
1424 int save_misalignment;
1425 save_misalignment = DR_MISALIGNMENT (dr_info);
1426 if (npeel == 0)
1427 ;
1428 else if (unknown_misalignment && dr_info == dr0_info)
1429 SET_DR_MISALIGNMENT (dr_info, 0);
1430 else
1431 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433 body_cost_vec, prologue_cost_vec);
1434 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1435 }
1436 }
1437
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439 Find the one with the lowest cost. */
1440
1441 int
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 _vect_peel_extended_info *min)
1444 {
1445 vect_peel_info elem = *slot;
1446 int dummy;
1447 unsigned int inside_cost = 0, outside_cost = 0;
1448 stmt_vec_info stmt_info = elem->dr_info->stmt;
1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 epilogue_cost_vec;
1452
1453 prologue_cost_vec.create (2);
1454 body_cost_vec.create (2);
1455 epilogue_cost_vec.create (2);
1456
1457 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458 &outside_cost, &body_cost_vec,
1459 &prologue_cost_vec, elem->npeel, false);
1460
1461 body_cost_vec.release ();
1462
1463 outside_cost += vect_get_known_peeling_cost
1464 (loop_vinfo, elem->npeel, &dummy,
1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466 &prologue_cost_vec, &epilogue_cost_vec);
1467
1468 /* Prologue and epilogue costs are added to the target model later.
1469 These costs depend only on the scalar iteration cost, the
1470 number of peeling iterations finally chosen, and the number of
1471 misaligned statements. So discard the information found here. */
1472 prologue_cost_vec.release ();
1473 epilogue_cost_vec.release ();
1474
1475 if (inside_cost < min->inside_cost
1476 || (inside_cost == min->inside_cost
1477 && outside_cost < min->outside_cost))
1478 {
1479 min->inside_cost = inside_cost;
1480 min->outside_cost = outside_cost;
1481 min->peel_info.dr_info = elem->dr_info;
1482 min->peel_info.npeel = elem->npeel;
1483 min->peel_info.count = elem->count;
1484 }
1485
1486 return 1;
1487 }
1488
1489
1490 /* Choose best peeling option by traversing peeling hash table and either
1491 choosing an option with the lowest cost (if cost model is enabled) or the
1492 option that aligns as many accesses as possible. */
1493
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 loop_vec_info loop_vinfo)
1497 {
1498 struct _vect_peel_extended_info res;
1499
1500 res.peel_info.dr_info = NULL;
1501
1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503 {
1504 res.inside_cost = INT_MAX;
1505 res.outside_cost = INT_MAX;
1506 peeling_htab->traverse <_vect_peel_extended_info *,
1507 vect_peeling_hash_get_lowest_cost> (&res);
1508 }
1509 else
1510 {
1511 res.peel_info.count = 0;
1512 peeling_htab->traverse <_vect_peel_extended_info *,
1513 vect_peeling_hash_get_most_frequent> (&res);
1514 res.inside_cost = 0;
1515 res.outside_cost = 0;
1516 }
1517
1518 return res;
1519 }
1520
1521 /* Return true if the new peeling NPEEL is supported. */
1522
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525 unsigned npeel)
1526 {
1527 unsigned i;
1528 struct data_reference *dr = NULL;
1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530 enum dr_alignment_support supportable_dr_alignment;
1531
1532 /* Ensure that all data refs can be vectorized after the peel. */
1533 FOR_EACH_VEC_ELT (datarefs, i, dr)
1534 {
1535 int save_misalignment;
1536
1537 if (dr == dr0_info->dr)
1538 continue;
1539
1540 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541 stmt_vec_info stmt_info = dr_info->stmt;
1542 /* For interleaving, only the alignment of the first access
1543 matters. */
1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 continue;
1547
1548 /* Strided accesses perform only component accesses, alignment is
1549 irrelevant for them. */
1550 if (STMT_VINFO_STRIDED_P (stmt_info)
1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 continue;
1553
1554 save_misalignment = DR_MISALIGNMENT (dr_info);
1555 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556 supportable_dr_alignment
1557 = vect_supportable_dr_alignment (dr_info, false);
1558 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1559
1560 if (!supportable_dr_alignment)
1561 return false;
1562 }
1563
1564 return true;
1565 }
1566
1567 /* Function vect_enhance_data_refs_alignment
1568
1569 This pass will use loop versioning and loop peeling in order to enhance
1570 the alignment of data references in the loop.
1571
1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573 original loop is to be vectorized. Any other loops that are created by
1574 the transformations performed in this pass are not supposed to be
1575 vectorized. This restriction will be relaxed.
1576
1577 This pass will require a cost model to guide it whether to apply peeling
1578 or versioning or a combination of the two. For example, the scheme that
1579 Intel uses when given a loop with several memory accesses is as follows:
1580 choose one memory access ('p') whose alignment you want to force by doing
1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1582 other accesses are not necessarily aligned, or (2) use loop versioning to
1583 generate one loop in which all accesses are aligned, and another loop in
1584 which only 'p' is necessarily aligned.
1585
1586 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589
1590 Devising a cost model is the most critical aspect of this work. It will
1591 guide us on which access to peel for, whether to use loop versioning, how
1592 many versions to create, etc. The cost model will probably consist of
1593 generic considerations as well as target specific considerations (on
1594 powerpc for example, misaligned stores are more painful than misaligned
1595 loads).
1596
1597 Here are the general steps involved in alignment enhancements:
1598
1599 -- original loop, before alignment analysis:
1600 for (i=0; i<N; i++){
1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1603 }
1604
1605 -- After vect_compute_data_refs_alignment:
1606 for (i=0; i<N; i++){
1607 x = q[i]; # DR_MISALIGNMENT(q) = 3
1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1609 }
1610
1611 -- Possibility 1: we do loop versioning:
1612 if (p is aligned) {
1613 for (i=0; i<N; i++){ # loop 1A
1614 x = q[i]; # DR_MISALIGNMENT(q) = 3
1615 p[i] = y; # DR_MISALIGNMENT(p) = 0
1616 }
1617 }
1618 else {
1619 for (i=0; i<N; i++){ # loop 1B
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1622 }
1623 }
1624
1625 -- Possibility 2: we do loop peeling:
1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1627 x = q[i];
1628 p[i] = y;
1629 }
1630 for (i = 3; i < N; i++){ # loop 2A
1631 x = q[i]; # DR_MISALIGNMENT(q) = 0
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1633 }
1634
1635 -- Possibility 3: combination of loop peeling and versioning:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1639 }
1640 if (p is aligned) {
1641 for (i = 3; i<N; i++){ # loop 3A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = 0
1644 }
1645 }
1646 else {
1647 for (i = 3; i<N; i++){ # loop 3B
1648 x = q[i]; # DR_MISALIGNMENT(q) = 0
1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1650 }
1651 }
1652
1653 These loops are later passed to loop_transform to be vectorized. The
1654 vectorizer will use the alignment information to guide the transformation
1655 (whether to generate regular loads/stores, or with special handling for
1656 misalignment). */
1657
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 {
1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 enum dr_alignment_support supportable_dr_alignment;
1664 dr_vec_info *first_store = NULL;
1665 dr_vec_info *dr0_info = NULL;
1666 struct data_reference *dr;
1667 unsigned int i, j;
1668 bool do_peeling = false;
1669 bool do_versioning = false;
1670 unsigned int npeel = 0;
1671 bool one_misalignment_known = false;
1672 bool one_misalignment_unknown = false;
1673 bool one_dr_unsupportable = false;
1674 dr_vec_info *unsupportable_dr_info = NULL;
1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676 unsigned possible_npeel_number = 1;
1677 tree vectype;
1678 unsigned int mis, same_align_drs_max = 0;
1679 hash_table<peel_info_hasher> peeling_htab (1);
1680
1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1689
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1694
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1700
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1703
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1708
1709 /* (1) Peeling to force alignment. */
1710
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1718
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1720 {
1721 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722 stmt_vec_info stmt_info = dr_info->stmt;
1723
1724 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 continue;
1726
1727 /* For interleaving, only the alignment of the first access
1728 matters. */
1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731 continue;
1732
1733 /* For scatter-gather or invariant accesses there is nothing
1734 to enhance. */
1735 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736 || integer_zerop (DR_STEP (dr)))
1737 continue;
1738
1739 /* Strided accesses perform only component accesses, alignment is
1740 irrelevant for them. */
1741 if (STMT_VINFO_STRIDED_P (stmt_info)
1742 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743 continue;
1744
1745 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746 do_peeling = vector_alignment_reachable_p (dr_info);
1747 if (do_peeling)
1748 {
1749 if (known_alignment_for_access_p (dr_info))
1750 {
1751 unsigned int npeel_tmp = 0;
1752 bool negative = tree_int_cst_compare (DR_STEP (dr),
1753 size_zero_node) < 0;
1754
1755 vectype = STMT_VINFO_VECTYPE (stmt_info);
1756 /* If known_alignment_for_access_p then we have set
1757 DR_MISALIGNMENT, which is only done if we know it at compile
1758 time, so it is safe to assume target alignment is constant.
1759 */
1760 unsigned int target_align =
1761 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763 mis = (negative
1764 ? DR_MISALIGNMENT (dr_info)
1765 : -DR_MISALIGNMENT (dr_info));
1766 if (DR_MISALIGNMENT (dr_info) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
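 /* For example (illustrative numbers): with a target alignment of
 16 bytes, a 4-byte element and a known misalignment of 12 bytes,
 MIS is -12, (MIS & 15) is 4 and NPEEL_TMP is 1 -- peeling one
 scalar iteration advances the access by 4 bytes and aligns it. */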
1768
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). The vectorization factor will be 8. If both
1773 accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will be aligned also by peeling 5 scalar
1776 iterations, and in that case both accesses will be aligned.
1777 Hence, except for the immediate peeling amount, we also want
1778 to try to add full vector size, while we don't exceed
1779 vectorization factor.
1780 We do this automatically for the cost model, since we calculate
1781 cost for every peeling option. */
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783 {
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1788
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr_info) == 0)
1792 possible_npeel_number++;
1793 }
1794
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1798 {
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr_info, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1802 }
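 /* With the illustrative numbers above (target alignment 16 bytes,
 4-byte elements, NPEEL_TMP 1) the candidate peel amounts recorded
 here are 1, 5, 9, ...: each subsequent candidate adds
 TARGET_ALIGN / DR_SIZE (here 4) elements, and at most
 POSSIBLE_NPEEL_NUMBER candidates are recorded. */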
1803
1804 one_misalignment_known = true;
1805 }
1806 else
1807 {
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over loads. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0_info
1815 || same_align_drs_max < same_align_drs)
1816 {
1817 same_align_drs_max = same_align_drs;
1818 dr0_info = dr_info;
1819 }
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1824 {
1825 struct loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0_info = dr_info;
1834 }
1835
1836 one_misalignment_unknown = true;
1837
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1841 {
1842 one_dr_unsupportable = true;
1843 unsupportable_dr_info = dr_info;
1844 }
1845
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr_info;
1848 }
1849 }
1850 else
1851 {
1852 if (!aligned_access_p (dr_info))
1853 {
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1858 }
1859 }
1860 }
1861
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1867
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1871
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1875
1876 if (do_peeling
1877 && one_misalignment_unknown)
1878 {
1879 /* Check whether the target prefers to align stores over loads, i.e., if
1880 misaligned stores are more expensive than misaligned loads (taking
1881 drs with same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1895
1896 if (first_store)
1897 {
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1905 }
1906 else
1907 {
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1910 }
1911
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1915 {
1916 dr0_info = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919 }
1920 else
1921 {
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924 }
1925
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1929
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1935
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1938
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1941 }
1942
1943 peel_for_unknown_alignment.peel_info.npeel = 0;
1944 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1945
1946 best_peel = peel_for_unknown_alignment;
1947
1948 peel_for_known_alignment.inside_cost = INT_MAX;
1949 peel_for_known_alignment.outside_cost = INT_MAX;
1950 peel_for_known_alignment.peel_info.count = 0;
1951 peel_for_known_alignment.peel_info.dr_info = NULL;
1952
1953 if (do_peeling && one_misalignment_known)
1954 {
1955 /* Peeling is possible, but there is no data access that requires alignment
1956 in order to be supported. So we try to choose the best possible peeling from
1957 the hash table. */
1958 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959 (&peeling_htab, loop_vinfo);
1960 }
1961
1962 /* Compare costs of peeling for known and unknown alignment. */
1963 if (peel_for_known_alignment.peel_info.dr_info != NULL
1964 && peel_for_unknown_alignment.inside_cost
1965 >= peel_for_known_alignment.inside_cost)
1966 {
1967 best_peel = peel_for_known_alignment;
1968
1969 /* If the best peeling for known alignment has NPEEL == 0, perform no
1970 peeling at all except if there is an unsupportable dr that we can
1971 align. */
1972 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973 do_peeling = false;
1974 }
1975
1976 /* If there is an unsupportable data ref, prefer this over all choices so far
1977 since we'd have to discard a chosen peeling except when it accidentally
1978 aligned the unsupportable data ref. */
1979 if (one_dr_unsupportable)
1980 dr0_info = unsupportable_dr_info;
1981 else if (do_peeling)
1982 {
1983 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984 TODO: Use nopeel_outside_cost or get rid of it? */
1985 unsigned nopeel_inside_cost = 0;
1986 unsigned nopeel_outside_cost = 0;
1987
1988 stmt_vector_for_cost dummy;
1989 dummy.create (2);
1990 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991 &nopeel_outside_cost, &dummy, &dummy,
1992 0, false);
1993 dummy.release ();
1994
1995 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1996 costs will be recorded. */
1997 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998 prologue_cost_vec.create (2);
1999 epilogue_cost_vec.create (2);
2000
2001 int dummy2;
2002 nopeel_outside_cost += vect_get_known_peeling_cost
2003 (loop_vinfo, 0, &dummy2,
2004 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005 &prologue_cost_vec, &epilogue_cost_vec);
2006
2007 prologue_cost_vec.release ();
2008 epilogue_cost_vec.release ();
2009
2010 npeel = best_peel.peel_info.npeel;
2011 dr0_info = best_peel.peel_info.dr_info;
2012
2013 /* If not peeling is no more expensive than the best peeling we have
2014 found so far, don't perform any peeling. */
2015 if (nopeel_inside_cost <= best_peel.inside_cost)
2016 do_peeling = false;
2017 }
2018
2019 if (do_peeling)
2020 {
2021 stmt_vec_info stmt_info = dr0_info->stmt;
2022 vectype = STMT_VINFO_VECTYPE (stmt_info);
2023
2024 if (known_alignment_for_access_p (dr0_info))
2025 {
2026 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027 size_zero_node) < 0;
2028 if (!npeel)
2029 {
2030 /* Since it's known at compile time, compute the number of
2031 iterations in the peeled loop (the peeling factor) for use in
2032 updating DR_MISALIGNMENT values. The peeling factor is the
2033 vectorization factor minus the misalignment as an element
2034 count. */
2035 mis = (negative
2036 ? DR_MISALIGNMENT (dr0_info)
2037 : -DR_MISALIGNMENT (dr0_info));
2038 /* If known_alignment_for_access_p then we have set
2039 DR_MISALIGNMENT, which is only done if we know it at compile
2040 time, so it is safe to assume target alignment is constant.
2041 */
2042 unsigned int target_align =
2043 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044 npeel = ((mis & (target_align - 1))
2045 / vect_get_scalar_dr_size (dr0_info));
2046 }
2047
2048 /* For interleaved data access every iteration accesses all the
2049 members of the group, therefore we divide the number of iterations
2050 by the group size. */
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 npeel /= DR_GROUP_SIZE (stmt_info);
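 /* E.g., if NPEEL was computed as 8 elements and the group has 4
 members, each scalar iteration already accesses all 4 members,
 so only 8 / 4 = 2 iterations need to be peeled. */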
2053
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "Try peeling by %d\n", npeel);
2057 }
2058
2059 /* Ensure that all datarefs can be vectorized after the peel. */
2060 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061 do_peeling = false;
2062
2063 /* Check if all datarefs are supportable and log. */
2064 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2065 {
2066 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067 if (!stat)
2068 do_peeling = false;
2069 else
2070 return stat;
2071 }
2072
2073 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2074 if (do_peeling)
2075 {
2076 unsigned max_allowed_peel
2077 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078 if (max_allowed_peel != (unsigned)-1)
2079 {
2080 unsigned max_peel = npeel;
2081 if (max_peel == 0)
2082 {
2083 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2084 unsigned HOST_WIDE_INT target_align_c;
2085 if (target_align.is_constant (&target_align_c))
2086 max_peel =
2087 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2088 else
2089 {
2090 do_peeling = false;
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "Disable peeling, max peels set and vector"
2094 " alignment unknown\n");
2095 }
2096 }
2097 if (max_peel > max_allowed_peel)
2098 {
2099 do_peeling = false;
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "Disable peeling, max peels reached: %d\n", max_peel);
2103 }
2104 }
2105 }
2106
2107 /* Cost model #2 - if peeling may result in a remaining loop not
2108 iterating enough to be vectorized then do not peel. Since this
2109 is a cost heuristic rather than a correctness decision, use the
2110 most likely runtime value for variable vectorization factors. */
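 /* E.g., with an assumed vectorization factor of 4 and an unknown
 peel amount (MAX_PEEL 3), fewer than 4 + 3 = 7 known iterations
 would leave too little work for the vector loop. */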
2111 if (do_peeling
2112 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2113 {
2114 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2115 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2116 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2117 < assumed_vf + max_peel)
2118 do_peeling = false;
2119 }
2120
2121 if (do_peeling)
2122 {
2123 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2124 If the misalignment of DR_i is identical to that of dr0 then set
2125 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2126 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2127 by the peeling factor times the element size of DR_i (MOD the
2128 vectorization factor times the size). Otherwise, the
2129 misalignment of DR_i must be set to unknown. */
2130 FOR_EACH_VEC_ELT (datarefs, i, dr)
2131 if (dr != dr0_info->dr)
2132 {
2133 /* Strided accesses perform only component accesses, alignment
2134 is irrelevant for them. */
2135 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2136 stmt_info = dr_info->stmt;
2137 if (STMT_VINFO_STRIDED_P (stmt_info)
2138 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2139 continue;
2140
2141 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2142 }
2143
2144 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2145 if (npeel)
2146 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2147 else
2148 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149 = DR_MISALIGNMENT (dr0_info);
2150 SET_DR_MISALIGNMENT (dr0_info, 0);
2151 if (dump_enabled_p ())
2152 {
2153 dump_printf_loc (MSG_NOTE, vect_location,
2154 "Alignment of access forced using peeling.\n");
2155 dump_printf_loc (MSG_NOTE, vect_location,
2156 "Peeling for alignment will be applied.\n");
2157 }
2158
2159 /* The inside-loop cost will be accounted for in vectorizable_load
2160 and vectorizable_store correctly with adjusted alignments.
2161 Drop the body_cst_vec on the floor here. */
2162 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2163 gcc_assert (stat);
2164 return stat;
2165 }
2166 }
2167
2168 /* (2) Versioning to force alignment. */
2169
2170 /* Try versioning if:
2171 1) optimize loop for speed
2172 2) there is at least one unsupported misaligned data ref with an unknown
2173 misalignment, and
2174 3) all misaligned data refs with a known misalignment are supported, and
2175 4) the number of runtime alignment checks is within reason. */
2176
2177 do_versioning =
2178 optimize_loop_nest_for_speed_p (loop)
2179 && (!loop->inner); /* FORNOW */
2180
2181 if (do_versioning)
2182 {
2183 FOR_EACH_VEC_ELT (datarefs, i, dr)
2184 {
2185 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2186 stmt_vec_info stmt_info = dr_info->stmt;
2187
2188 /* For interleaving, only the alignment of the first access
2189 matters. */
2190 if (aligned_access_p (dr_info)
2191 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2192 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2193 continue;
2194
2195 if (STMT_VINFO_STRIDED_P (stmt_info))
2196 {
2197 /* Strided loads perform only component accesses, alignment is
2198 irrelevant for them. */
2199 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2200 continue;
2201 do_versioning = false;
2202 break;
2203 }
2204
2205 supportable_dr_alignment
2206 = vect_supportable_dr_alignment (dr_info, false);
2207
2208 if (!supportable_dr_alignment)
2209 {
2210 int mask;
2211 tree vectype;
2212
2213 if (known_alignment_for_access_p (dr_info)
2214 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2215 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2216 {
2217 do_versioning = false;
2218 break;
2219 }
2220
2221 vectype = STMT_VINFO_VECTYPE (stmt_info);
2222 gcc_assert (vectype);
2223
2224 /* At present we don't support versioning for alignment
2225 with variable VF, since there's no guarantee that the
2226 VF is a power of two. We could relax this if we added
2227 a way of enforcing a power-of-two size. */
2228 unsigned HOST_WIDE_INT size;
2229 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2230 {
2231 do_versioning = false;
2232 break;
2233 }
2234
2235 /* Forcing alignment in the first iteration is no good if
2236 we don't keep it across iterations. For now, just disable
2237 versioning in this case.
2238 ?? We could actually unroll the loop to achieve the required
2239 overall step alignment, and forcing the alignment could be
2240 done by doing some iterations of the non-vectorized loop. */
2241 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2242 * DR_STEP_ALIGNMENT (dr),
2243 DR_TARGET_ALIGNMENT (dr_info)))
2244 {
2245 do_versioning = false;
2246 break;
2247 }
2248
2249 /* The rightmost bits of an aligned address must be zeros.
2250 Construct the mask needed for this test. For example,
2251 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2252 mask must be 15 = 0xf. */
2253 mask = size - 1;
2254
2255 /* FORNOW: use the same mask to test all potentially unaligned
2256 references in the loop. The vectorizer currently supports
2257 a single vector size, see the reference to
2258 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2259 vectorization factor is computed. */
2260 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2261 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2262 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2263 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2264 }
2265 }
2266
2267 /* Versioning requires at least one misaligned data reference. */
2268 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2269 do_versioning = false;
2270 else if (!do_versioning)
2271 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2272 }
2273
2274 if (do_versioning)
2275 {
2276 vec<stmt_vec_info> may_misalign_stmts
2277 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2278 stmt_vec_info stmt_info;
2279
2280 /* It can now be assumed that the data references in the statements
2281 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2282 of the loop being vectorized. */
2283 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2284 {
2285 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2286 SET_DR_MISALIGNMENT (dr_info, 0);
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_NOTE, vect_location,
2289 "Alignment of access forced using versioning.\n");
2290 }
2291
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "Versioning for alignment will be applied.\n");
2295
2296 /* Peeling and versioning can't be done together at this time. */
2297 gcc_assert (! (do_peeling && do_versioning));
2298
2299 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2300 gcc_assert (stat);
2301 return stat;
2302 }
2303
2304 /* This point is reached if neither peeling nor versioning is being done. */
2305 gcc_assert (! (do_peeling || do_versioning));
2306
2307 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2308 return stat;
2309 }
2310
2311
2312 /* Function vect_find_same_alignment_drs.
2313
2314 Update group and alignment relations in VINFO according to the chosen
2315 vectorization factor. */
2316
2317 static void
2318 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2319 {
2320 struct data_reference *dra = DDR_A (ddr);
2321 struct data_reference *drb = DDR_B (ddr);
2322 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2323 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2324 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2325 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2326
2327 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2328 return;
2329
2330 if (dra == drb)
2331 return;
2332
2333 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2334 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2335 return;
2336
2337 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2338 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2339 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2340 return;
2341
2342 /* Two references with distance zero have the same alignment. */
2343 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2344 - wi::to_poly_offset (DR_INIT (drb)));
2345 if (maybe_ne (diff, 0))
2346 {
2347 /* Get the wider of the two alignments. */
2348 poly_uint64 align_a =
2349 exact_div (vect_calculate_target_alignment (dr_info_a),
2350 BITS_PER_UNIT);
2351 poly_uint64 align_b =
2352 exact_div (vect_calculate_target_alignment (dr_info_b),
2353 BITS_PER_UNIT);
2354 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2355 if (!align_a.is_constant (&align_a_c)
2356 || !align_b.is_constant (&align_b_c))
2357 return;
2358
2359 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2360
2361 /* Require the gap to be a multiple of the larger vector alignment. */
2362 if (!multiple_p (diff, max_align))
2363 return;
2364 }
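 /* At this point DIFF is either zero or a multiple of both target
 alignments (powers of two), so the two references share the same
 misalignment; e.g., int accesses 32 bytes apart when both vector
 alignments are 16 bytes. */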
2365
2366 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2367 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE, vect_location,
2370 "accesses have the same alignment: %T and %T\n",
2371 DR_REF (dra), DR_REF (drb));
2372 }
2373
2374
2375 /* Function vect_analyze_data_refs_alignment
2376
2377 Analyze the alignment of the data-references in the loop.
2378 Return FALSE if a data reference is found that cannot be vectorized. */
2379
2380 opt_result
2381 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2382 {
2383 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2384
2385 /* Mark groups of data references with same alignment using
2386 data dependence information. */
2387 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2388 struct data_dependence_relation *ddr;
2389 unsigned int i;
2390
2391 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2392 vect_find_same_alignment_drs (vinfo, ddr);
2393
2394 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2395 struct data_reference *dr;
2396
2397 vect_record_base_alignments (vinfo);
2398 FOR_EACH_VEC_ELT (datarefs, i, dr)
2399 {
2400 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2401 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2402 vect_compute_data_ref_alignment (dr_info);
2403 }
2404
2405 return opt_result::success ();
2406 }
2407
2408
2409 /* Analyze alignment of DRs of stmts in NODE. */
2410
2411 static bool
2412 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2413 {
2414 /* We vectorize from the first scalar stmt in the node unless
2415 the node is permuted in which case we start from the first
2416 element in the group. */
2417 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2418 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2419 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2420 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2421
2422 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2423 vect_compute_data_ref_alignment (dr_info);
2424 /* For creating the data-ref pointer we need alignment of the
2425 first element anyway. */
2426 if (dr_info != first_dr_info)
2427 vect_compute_data_ref_alignment (first_dr_info);
2428 if (! verify_data_ref_alignment (dr_info))
2429 {
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "not vectorized: bad data alignment in basic "
2433 "block.\n");
2434 return false;
2435 }
2436
2437 return true;
2438 }
2439
2440 /* Function vect_slp_analyze_instance_alignment
2441
2442 Analyze the alignment of the data-references in the SLP instance.
2443 Return FALSE if a data reference is found that cannot be vectorized. */
2444
2445 bool
2446 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2447 {
2448 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2449
2450 slp_tree node;
2451 unsigned i;
2452 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2453 if (! vect_slp_analyze_and_verify_node_alignment (node))
2454 return false;
2455
2456 node = SLP_INSTANCE_TREE (instance);
2457 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2458 && ! vect_slp_analyze_and_verify_node_alignment
2459 (SLP_INSTANCE_TREE (instance)))
2460 return false;
2461
2462 return true;
2463 }
2464
2465
2466 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2467 accesses of legal size, step, etc. Detect gaps, single element
2468 interleaving, and other special cases. Set grouped access info.
2469 Collect groups of strided stores for further use in SLP analysis.
2470 Worker for vect_analyze_group_access. */
2471
2472 static bool
2473 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2474 {
2475 data_reference *dr = dr_info->dr;
2476 tree step = DR_STEP (dr);
2477 tree scalar_type = TREE_TYPE (DR_REF (dr));
2478 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2479 stmt_vec_info stmt_info = dr_info->stmt;
2480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2481 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2482 HOST_WIDE_INT dr_step = -1;
2483 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2484 bool slp_impossible = false;
2485
2486 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2487 size of the interleaving group (including gaps). */
2488 if (tree_fits_shwi_p (step))
2489 {
2490 dr_step = tree_to_shwi (step);
2491 /* Check that STEP is a multiple of type size. Otherwise there is
2492 a non-element-sized gap at the end of the group which we
2493 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2494 ??? As we can handle non-constant step fine here we should
2495 simply remove uses of DR_GROUP_GAP between the last and first
2496 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2497 simply not include that gap. */
2498 if ((dr_step % type_size) != 0)
2499 {
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_NOTE, vect_location,
2502 "Step %T is not a multiple of the element size"
2503 " for %T\n",
2504 step, DR_REF (dr));
2505 return false;
2506 }
2507 groupsize = absu_hwi (dr_step) / type_size;
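 /* E.g., a step of 32 bytes over 4-byte elements gives a GROUPSIZE
 of 8 elements, counting any gaps. */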
2508 }
2509 else
2510 groupsize = 0;
2511
2512 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2513 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2514 {
2515 /* Check if this DR is part of an interleaving group, and is a single
2516 element of the group that is accessed in the loop. */
2517
2518 /* Gaps are supported only for loads. STEP must be a multiple of the type
2519 size. */
2520 if (DR_IS_READ (dr)
2521 && (dr_step % type_size) == 0
2522 && groupsize > 0)
2523 {
2524 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2525 DR_GROUP_SIZE (stmt_info) = groupsize;
2526 DR_GROUP_GAP (stmt_info) = groupsize - 1;
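 /* E.g., a load of a[4*i] with 4-byte elements has a 16-byte step,
 so GROUPSIZE is 4 and the gap after the single accessed member
 is 3 elements. */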
2527 if (dump_enabled_p ())
2528 dump_printf_loc (MSG_NOTE, vect_location,
2529 "Detected single element interleaving %T"
2530 " step %T\n",
2531 DR_REF (dr), step);
2532
2533 return true;
2534 }
2535
2536 if (dump_enabled_p ())
2537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2538 "not consecutive access %G", stmt_info->stmt);
2539
2540 if (bb_vinfo)
2541 {
2542 /* Mark the statement as unvectorizable. */
2543 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2544 return true;
2545 }
2546
2547 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2549 STMT_VINFO_STRIDED_P (stmt_info) = true;
2550 return true;
2551 }
2552
2553 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2554 {
2555 /* First stmt in the interleaving chain. Check the chain. */
2556 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2557 struct data_reference *data_ref = dr;
2558 unsigned int count = 1;
2559 tree prev_init = DR_INIT (data_ref);
2560 HOST_WIDE_INT diff, gaps = 0;
2561
2562 /* By construction, all group members have INTEGER_CST DR_INITs. */
2563 while (next)
2564 {
2565 /* We never have the same DR multiple times. */
2566 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2567 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2568
2569 data_ref = STMT_VINFO_DATA_REF (next);
2570
2571 /* All group members have the same STEP by construction. */
2572 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2573
2574 /* Check that the distance between two accesses is equal to the type
2575 size. Otherwise, we have gaps. */
2576 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577 - TREE_INT_CST_LOW (prev_init)) / type_size;
2578 if (diff != 1)
2579 {
2580 /* FORNOW: SLP of accesses with gaps is not supported. */
2581 slp_impossible = true;
2582 if (DR_IS_WRITE (data_ref))
2583 {
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586 "interleaved store with gaps\n");
2587 return false;
2588 }
2589
2590 gaps += diff - 1;
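 /* E.g., group members at byte offsets 0 and 12 with 4-byte
 elements give DIFF 3, i.e. a gap of 2 elements between them. */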
2591 }
2592
2593 last_accessed_element += diff;
2594
2595 /* Store the gap from the previous member of the group. If there is no
2596 gap in the access, DR_GROUP_GAP is always 1. */
2597 DR_GROUP_GAP (next) = diff;
2598
2599 prev_init = DR_INIT (data_ref);
2600 next = DR_GROUP_NEXT_ELEMENT (next);
2601 /* Count the number of data-refs in the chain. */
2602 count++;
2603 }
2604
2605 if (groupsize == 0)
2606 groupsize = count + gaps;
2607
2608 /* This could be UINT_MAX but as we are generating code in a very
2609 inefficient way we have to cap earlier. See PR78699 for example. */
2610 if (groupsize > 4096)
2611 {
2612 if (dump_enabled_p ())
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "group is too large\n");
2615 return false;
2616 }
2617
2618 /* Check that the size of the interleaving is equal to count for stores,
2619 i.e., that there are no gaps. */
2620 if (groupsize != count
2621 && !DR_IS_READ (dr))
2622 {
2623 groupsize = count;
2624 STMT_VINFO_STRIDED_P (stmt_info) = true;
2625 }
2626
2627 /* If there is a gap after the last load in the group it is the
2628 difference between the groupsize and the last accessed
2629 element.
2630 When there is no gap, this difference should be 0. */
2631 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
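 /* E.g., a load group accessing only a[4*i] and a[4*i+1] has
 GROUPSIZE 4 and LAST_ACCESSED_ELEMENT 2, leaving a trailing gap
 of 2 elements. */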
2632
2633 DR_GROUP_SIZE (stmt_info) = groupsize;
2634 if (dump_enabled_p ())
2635 {
2636 dump_printf_loc (MSG_NOTE, vect_location,
2637 "Detected interleaving ");
2638 if (DR_IS_READ (dr))
2639 dump_printf (MSG_NOTE, "load ");
2640 else if (STMT_VINFO_STRIDED_P (stmt_info))
2641 dump_printf (MSG_NOTE, "strided store ");
2642 else
2643 dump_printf (MSG_NOTE, "store ");
2644 dump_printf (MSG_NOTE, "of size %u\n",
2645 (unsigned)groupsize);
2646 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2647 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2648 while (next)
2649 {
2650 if (DR_GROUP_GAP (next) != 1)
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "\t<gap of %d elements>\n",
2653 DR_GROUP_GAP (next) - 1);
2654 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2655 next = DR_GROUP_NEXT_ELEMENT (next);
2656 }
2657 if (DR_GROUP_GAP (stmt_info) != 0)
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "\t<gap of %d elements>\n",
2660 DR_GROUP_GAP (stmt_info));
2661 }
2662
2663 /* SLP: create an SLP data structure for every interleaving group of
2664 stores for further analysis in vect_analyze_slp. */
2665 if (DR_IS_WRITE (dr) && !slp_impossible)
2666 {
2667 if (loop_vinfo)
2668 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2669 if (bb_vinfo)
2670 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2671 }
2672 }
2673
2674 return true;
2675 }
2676
2677 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2678 accesses of legal size, step, etc. Detect gaps, single element
2679 interleaving, and other special cases. Set grouped access info.
2680 Collect groups of strided stores for further use in SLP analysis. */
2681
2682 static bool
2683 vect_analyze_group_access (dr_vec_info *dr_info)
2684 {
2685 if (!vect_analyze_group_access_1 (dr_info))
2686 {
2687 /* Dissolve the group if present. */
2688 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2689 while (stmt_info)
2690 {
2691 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2692 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2693 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2694 stmt_info = next;
2695 }
2696 return false;
2697 }
2698 return true;
2699 }
2700
2701 /* Analyze the access pattern of the data-reference DR_INFO.
2702 In case of non-consecutive accesses call vect_analyze_group_access() to
2703 analyze groups of accesses. */
2704
2705 static bool
2706 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2707 {
2708 data_reference *dr = dr_info->dr;
2709 tree step = DR_STEP (dr);
2710 tree scalar_type = TREE_TYPE (DR_REF (dr));
2711 stmt_vec_info stmt_info = dr_info->stmt;
2712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2713 struct loop *loop = NULL;
2714
2715 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2716 return true;
2717
2718 if (loop_vinfo)
2719 loop = LOOP_VINFO_LOOP (loop_vinfo);
2720
2721 if (loop_vinfo && !step)
2722 {
2723 if (dump_enabled_p ())
2724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2725 "bad data-ref access in loop\n");
2726 return false;
2727 }
2728
2729 /* Allow loads with zero step in inner-loop vectorization. */
2730 if (loop_vinfo && integer_zerop (step))
2731 {
2732 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733 if (!nested_in_vect_loop_p (loop, stmt_info))
2734 return DR_IS_READ (dr);
2735 /* Allow references with zero step for outer loops marked
2736 with pragma omp simd only - it guarantees absence of
2737 loop-carried dependencies between inner loop iterations. */
2738 if (loop->safelen < 2)
2739 {
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_NOTE, vect_location,
2742 "zero step in inner loop of nest\n");
2743 return false;
2744 }
2745 }
2746
2747 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2748 {
2749 /* Interleaved accesses are not yet supported within outer-loop
2750 vectorization for references in the inner-loop. */
2751 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2752
2753 /* For the rest of the analysis we use the outer-loop step. */
2754 step = STMT_VINFO_DR_STEP (stmt_info);
2755 if (integer_zerop (step))
2756 {
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "zero step in outer loop.\n");
2760 return DR_IS_READ (dr);
2761 }
2762 }
2763
2764 /* Consecutive? */
2765 if (TREE_CODE (step) == INTEGER_CST)
2766 {
2767 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2768 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2769 || (dr_step < 0
2770 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2771 {
2772 /* Mark that it is not interleaving. */
2773 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2774 return true;
2775 }
2776 }
2777
2778 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2779 {
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_NOTE, vect_location,
2782 "grouped access in outer loop.\n");
2783 return false;
2784 }
2785
2786
2787 /* Assume this is a DR handled by the non-constant strided load case. */
2788 if (TREE_CODE (step) != INTEGER_CST)
2789 return (STMT_VINFO_STRIDED_P (stmt_info)
2790 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2791 || vect_analyze_group_access (dr_info)));
2792
2793 /* Non-consecutive access - check whether it is part of an interleaving group. */
2794 return vect_analyze_group_access (dr_info);
2795 }
2796
2797 /* Compare two data-references DRA and DRB to group them into chunks
2798 suitable for grouping. */
2799
2800 static int
2801 dr_group_sort_cmp (const void *dra_, const void *drb_)
2802 {
2803 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2804 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2805 int cmp;
2806
2807 /* Stabilize sort. */
2808 if (dra == drb)
2809 return 0;
2810
2811 /* DRs in different loops never belong to the same group. */
2812 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2813 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2814 if (loopa != loopb)
2815 return loopa->num < loopb->num ? -1 : 1;
2816
2817 /* Ordering of DRs according to base. */
2818 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2819 DR_BASE_ADDRESS (drb));
2820 if (cmp != 0)
2821 return cmp;
2822
2823 /* And according to DR_OFFSET. */
2824 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2825 if (cmp != 0)
2826 return cmp;
2827
2828 /* Put reads before writes. */
2829 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2830 return DR_IS_READ (dra) ? -1 : 1;
2831
2832 /* Then sort after access size. */
2833 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2834 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2835 if (cmp != 0)
2836 return cmp;
2837
2838 /* And after step. */
2839 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2840 if (cmp != 0)
2841 return cmp;
2842
2843 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2844 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2845 if (cmp == 0)
2846 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2847 return cmp;
2848 }
2849
2850 /* If OP is the result of a conversion, return the unconverted value,
2851 otherwise return null. */
2852
2853 static tree
2854 strip_conversion (tree op)
2855 {
2856 if (TREE_CODE (op) != SSA_NAME)
2857 return NULL_TREE;
2858 gimple *stmt = SSA_NAME_DEF_STMT (op);
2859 if (!is_gimple_assign (stmt)
2860 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2861 return NULL_TREE;
2862 return gimple_assign_rhs1 (stmt);
2863 }
2864
2865 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2866 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2867 be grouped in SLP mode. */
2868
2869 static bool
2870 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2871 bool allow_slp_p)
2872 {
2873 if (gimple_assign_single_p (stmt1_info->stmt))
2874 return gimple_assign_single_p (stmt2_info->stmt);
2875
2876 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2877 if (call1 && gimple_call_internal_p (call1))
2878 {
2879 /* Check for two masked loads or two masked stores. */
2880 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2881 if (!call2 || !gimple_call_internal_p (call2))
2882 return false;
2883 internal_fn ifn = gimple_call_internal_fn (call1);
2884 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885 return false;
2886 if (ifn != gimple_call_internal_fn (call2))
2887 return false;
2888
2889 /* Check that the masks are the same. Cope with casts of masks,
2890 like those created by build_mask_conversion. */
2891 tree mask1 = gimple_call_arg (call1, 2);
2892 tree mask2 = gimple_call_arg (call2, 2);
2893 if (!operand_equal_p (mask1, mask2, 0)
2894 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2895 {
2896 mask1 = strip_conversion (mask1);
2897 if (!mask1)
2898 return false;
2899 mask2 = strip_conversion (mask2);
2900 if (!mask2)
2901 return false;
2902 if (!operand_equal_p (mask1, mask2, 0))
2903 return false;
2904 }
2905 return true;
2906 }
2907
2908 return false;
2909 }
2910
2911 /* Function vect_analyze_data_ref_accesses.
2912
2913 Analyze the access pattern of all the data references in the loop.
2914
2915 FORNOW: the only access pattern that is considered vectorizable is a
2916 simple step 1 (consecutive) access.
2917
2918 FORNOW: handle only arrays and pointer accesses. */
2919
2920 opt_result
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 {
2923 unsigned int i;
2924 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925 struct data_reference *dr;
2926
2927 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928
2929 if (datarefs.is_empty ())
2930 return opt_result::success ();
2931
2932 /* Sort the array of datarefs to make building the interleaving chains
2933 linear. Don't modify the original vector's order; it is needed for
2934 determining what dependencies are reversed. */
2935 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936 datarefs_copy.qsort (dr_group_sort_cmp);
2937 hash_set<stmt_vec_info> to_fixup;
2938
2939 /* Build the interleaving chains. */
2940 for (i = 0; i < datarefs_copy.length () - 1;)
2941 {
2942 data_reference_p dra = datarefs_copy[i];
2943 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2944 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2945 stmt_vec_info lastinfo = NULL;
2946 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2947 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2948 {
2949 ++i;
2950 continue;
2951 }
2952 for (i = i + 1; i < datarefs_copy.length (); ++i)
2953 {
2954 data_reference_p drb = datarefs_copy[i];
2955 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2956 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2957 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2958 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2959 break;
2960
2961 /* ??? Imperfect sorting (non-compatible types, non-modulo
2962 accesses, same accesses) can lead to a group being artificially
2963 split here as we don't just skip over those. If it really
2964 matters we can push those to a worklist and re-iterate
2965 over them. Then we can just skip ahead to the next DR here. */
2966
2967 /* DRs in a different loop should not be put into the same
2968 interleaving group. */
2969 if (gimple_bb (DR_STMT (dra))->loop_father
2970 != gimple_bb (DR_STMT (drb))->loop_father)
2971 break;
2972
2973 /* Check that the data-refs have same first location (except init)
2974 and they are both either store or load (not load and store,
2975 not masked loads or stores). */
2976 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2977 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2978 DR_BASE_ADDRESS (drb)) != 0
2979 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2980 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2981 break;
2982
2983 /* Check that the data-refs have the same constant size. */
2984 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2985 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2986 if (!tree_fits_uhwi_p (sza)
2987 || !tree_fits_uhwi_p (szb)
2988 || !tree_int_cst_equal (sza, szb))
2989 break;
2990
2991 /* Check that the data-refs have the same step. */
2992 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2993 break;
2994
2995 /* Check the types are compatible.
2996 ??? We don't distinguish this during sorting. */
2997 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2998 TREE_TYPE (DR_REF (drb))))
2999 break;
3000
3001 /* Check that the DR_INITs are compile-time constants. */
3002 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3003 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3004 break;
3005
3006 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3007 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3008 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3009 HOST_WIDE_INT init_prev
3010 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3011 gcc_assert (init_a <= init_b
3012 && init_a <= init_prev
3013 && init_prev <= init_b);
3014
3015 /* Do not place the same access in the interleaving chain twice. */
3016 if (init_b == init_prev)
3017 {
3018 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3019 < gimple_uid (DR_STMT (drb)));
3020 /* Simply link in duplicates and fix up the chain below. */
3021 }
3022 else
3023 {
3024 /* If init_b == init_a + the size of the type * k, we have an
3025 interleaving, and DRA is accessed before DRB. */
3026 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3027 if (type_size_a == 0
3028 || (init_b - init_a) % type_size_a != 0)
3029 break;
3030
3031 /* If we have a store, the accesses must be adjacent. This splits
3032 groups into chunks we support (we don't support vectorization
3033 of stores with gaps). */
3034 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3035 break;
3036
3037 /* If the step (when nonzero and constant) is greater than the
3038 difference between the data-refs' inits, this splits groups into
3039 suitable sizes. */
3040 if (tree_fits_shwi_p (DR_STEP (dra)))
3041 {
3042 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3043 if (step != 0 && step <= (init_b - init_a))
3044 break;
3045 }
3046 }
3047
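 /* E.g., 4-byte accesses to a[2*i] and a[2*i+1] share the base,
 offset and step (8 bytes), their DR_INITs differ by one element
 and the step exceeds that difference, so they are linked into a
 single interleaving group below. */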
3048 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_NOTE, vect_location,
3050 DR_IS_READ (dra)
3051 ? "Detected interleaving load %T and %T\n"
3052 : "Detected interleaving store %T and %T\n",
3053 DR_REF (dra), DR_REF (drb));
3054
3055 /* Link the found element into the group list. */
3056 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3057 {
3058 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3059 lastinfo = stmtinfo_a;
3060 }
3061 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3062 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3063 lastinfo = stmtinfo_b;
3064
3065 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3066 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3067
3068 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3069 dump_printf_loc (MSG_NOTE, vect_location,
3070 "Load suitable for SLP vectorization only.\n");
3071
3072 if (init_b == init_prev
3073 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3074 && dump_enabled_p ())
3075 dump_printf_loc (MSG_NOTE, vect_location,
3076 "Queuing group with duplicate access for fixup\n");
3077 }
3078 }
3079
3080 /* Fix up groups with duplicate entries by splitting them. */
3081 while (1)
3082 {
3083 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3084 if (!(it != to_fixup.end ()))
3085 break;
3086 stmt_vec_info grp = *it;
3087 to_fixup.remove (grp);
3088
3089 /* Find the earliest duplicate group member. */
3090 unsigned first_duplicate = -1u;
3091 stmt_vec_info next, g = grp;
3092 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3093 {
3094 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3095 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3096 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3097 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3098 g = next;
3099 }
3100 if (first_duplicate == -1U)
3101 continue;
3102
3103 /* Then move all stmts after the first duplicate to a new group.
3104 Note this is a heuristic but one with the property that *it
3105 is fixed up completely. */
3106 g = grp;
3107 stmt_vec_info newgroup = NULL, ng = grp;
3108 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3109 {
3110 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3111 {
3112 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3113 if (!newgroup)
3114 newgroup = next;
3115 else
3116 DR_GROUP_NEXT_ELEMENT (ng) = next;
3117 ng = next;
3118 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3119 }
3120 else
3121 g = DR_GROUP_NEXT_ELEMENT (g);
3122 }
3123 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3124
3125 /* Fix up the new group, which may still contain duplicates. */
3126 to_fixup.add (newgroup);
3127 }
3128
3129 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3130 {
3131 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3132 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3133 && !vect_analyze_data_ref_access (dr_info))
3134 {
3135 if (dump_enabled_p ())
3136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3137 "not vectorized: complicated access pattern.\n");
3138
3139 if (is_a <bb_vec_info> (vinfo))
3140 {
3141 /* Mark the statement as not vectorizable. */
3142 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3143 continue;
3144 }
3145 else
3146 {
3147 datarefs_copy.release ();
3148 return opt_result::failure_at (dr_info->stmt->stmt,
3149 "not vectorized:"
3150 " complicated access pattern.\n");
3151 }
3152 }
3153 }
3154
3155 datarefs_copy.release ();
3156 return opt_result::success ();
3157 }
3158
3159 /* Function vect_vfa_segment_size.
3160
3161 Input:
3162 DR_INFO: The data reference.
3163 LENGTH_FACTOR: segment length to consider.
3164
3165 Return a value suitable for the dr_with_seg_len::seg_len field.
3166 This is the "distance travelled" by the pointer from the first
3167 iteration in the segment to the last. Note that it does not include
3168 the size of the access; in effect it only describes the first byte. */
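 For instance, a data reference with a byte step of 4 and a
 LENGTH_FACTOR of 8 iterations yields a segment length of
 (8 - 1) * 4 = 28 bytes.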
3169
3170 static tree
3171 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3172 {
3173 length_factor = size_binop (MINUS_EXPR,
3174 fold_convert (sizetype, length_factor),
3175 size_one_node);
3176 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3177 length_factor);
3178 }
3179
3180 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3181 gives the worst-case number of bytes covered by the segment. */
3182
3183 static unsigned HOST_WIDE_INT
3184 vect_vfa_access_size (dr_vec_info *dr_info)
3185 {
3186 stmt_vec_info stmt_vinfo = dr_info->stmt;
3187 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3188 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3189 unsigned HOST_WIDE_INT access_size = ref_size;
3190 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3191 {
3192 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3193 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3194 }
3195 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3196 && (vect_supportable_dr_alignment (dr_info, false)
3197 == dr_explicit_realign_optimized))
3198 {
3199 /* We might access a full vector's worth. */
3200 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3201 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3202 }
3203 return access_size;
3204 }
3205
3206 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3207 describes. */
3208
3209 static unsigned int
3210 vect_vfa_align (dr_vec_info *dr_info)
3211 {
3212 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3213 }
3214
3215 /* Function vect_no_alias_p.
3216
3217 Given data references A and B with equal base and offset, see whether
3218 the alias relation can be decided at compilation time. Return 1 if
3219 it can and the references alias, 0 if it can and the references do
3220 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3221 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3222 of dr_with_seg_len::{seg_len,access_size} for A and B. */
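 For example, if A and B access the same array at constant offsets
 0 and 400 with segment lengths of 28 bytes and access sizes of 4
 bytes, the ranges [0, 32) and [400, 432) cannot overlap and the
 result is 0.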
3223
3224 static int
3225 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3226 tree segment_length_a, tree segment_length_b,
3227 unsigned HOST_WIDE_INT access_size_a,
3228 unsigned HOST_WIDE_INT access_size_b)
3229 {
3230 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3231 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3232 poly_uint64 const_length_a;
3233 poly_uint64 const_length_b;
3234
3235 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3236 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3237 [a, a+12) */
3238 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3239 {
3240 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3241 offset_a = (offset_a + access_size_a) - const_length_a;
3242 }
3243 else
3244 const_length_a = tree_to_poly_uint64 (segment_length_a);
3245 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3246 {
3247 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3248 offset_b = (offset_b + access_size_b) - const_length_b;
3249 }
3250 else
3251 const_length_b = tree_to_poly_uint64 (segment_length_b);
3252
3253 const_length_a += access_size_a;
3254 const_length_b += access_size_b;
3255
3256 if (ranges_known_overlap_p (offset_a, const_length_a,
3257 offset_b, const_length_b))
3258 return 1;
3259
3260 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3261 offset_b, const_length_b))
3262 return 0;
3263
3264 return -1;
3265 }
3266
3267 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3268 in DDR is >= VF. */
3269
3270 static bool
3271 dependence_distance_ge_vf (data_dependence_relation *ddr,
3272 unsigned int loop_depth, poly_uint64 vf)
3273 {
3274 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3275 || DDR_NUM_DIST_VECTS (ddr) == 0)
3276 return false;
3277
3278 /* If the dependence is exact, we should have limited the VF instead. */
3279 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3280
3281 unsigned int i;
3282 lambda_vector dist_v;
3283 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3284 {
3285 HOST_WIDE_INT dist = dist_v[loop_depth];
3286 if (dist != 0
3287 && !(dist > 0 && DDR_REVERSED_P (ddr))
3288 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3289 return false;
3290 }
3291
3292 if (dump_enabled_p ())
3293 dump_printf_loc (MSG_NOTE, vect_location,
3294 "dependence distance between %T and %T is >= VF\n",
3295 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3296
3297 return true;
3298 }
3299
3300 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3301
3302 static void
3303 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3304 {
3305 dump_printf (dump_kind, "%s (%T) >= ",
3306 lower_bound.unsigned_p ? "unsigned" : "abs",
3307 lower_bound.expr);
3308 dump_dec (dump_kind, lower_bound.min_value);
3309 }
3310
3311 /* Record that the vectorized loop requires the vec_lower_bound described
3312 by EXPR, UNSIGNED_P and MIN_VALUE. */
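/* For instance (an illustrative use; see vect_prune_runtime_alias_test_list
   below), the nonzero-step checks collected during dependence analysis are
   re-expressed through this function as run-time tests that
   unsigned (step) >= 1.  */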
3313
3314 static void
3315 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3316 poly_uint64 min_value)
3317 {
3318 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3319 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3320 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3321 {
3322 unsigned_p &= lower_bounds[i].unsigned_p;
3323 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3324 if (lower_bounds[i].unsigned_p != unsigned_p
3325 || maybe_lt (lower_bounds[i].min_value, min_value))
3326 {
3327 lower_bounds[i].unsigned_p = unsigned_p;
3328 lower_bounds[i].min_value = min_value;
3329 if (dump_enabled_p ())
3330 {
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 "updating run-time check to ");
3333 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3334 dump_printf (MSG_NOTE, "\n");
3335 }
3336 }
3337 return;
3338 }
3339
3340 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3341 if (dump_enabled_p ())
3342 {
3343 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3344 dump_lower_bound (MSG_NOTE, lower_bound);
3345 dump_printf (MSG_NOTE, "\n");
3346 }
3347 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3348 }
3349
3350 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3351 will span fewer than GAP bytes. */
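/* A hypothetical worked example: with an estimated VF of 4, a group of 2
   accesses and a 4-byte scalar access size, the vector step is expected to
   cover 4 * 2 * 4 == 32 bytes, so a gap of 24 bytes counts as small while
   a gap of 40 bytes does not.  */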
3352
3353 static bool
3354 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3355 poly_int64 gap)
3356 {
3357 stmt_vec_info stmt_info = dr_info->stmt;
3358 HOST_WIDE_INT count
3359 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3360 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3361 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3362 return (estimated_poly_value (gap)
3363 <= count * vect_get_scalar_dr_size (dr_info));
3364 }
3365
3366 /* Return true if we know that there is no alias between DR_INFO_A and
3367 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3368 When returning true, set *LOWER_BOUND_OUT to this N. */
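/* A worked example (hypothetical values): for two int accesses a[i] and
   a[i + 2] that share the same run-time step, DR_INIT values 0 and 8 and a
   4-byte element size give a lower bound of 8 + 4 - 0 == 12, i.e. the
   accesses cannot alias whenever abs (step) >= 12 bytes.  */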
3369
3370 static bool
3371 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3372 poly_uint64 *lower_bound_out)
3373 {
3374 /* Check that there is a constant gap of known sign between DR_A
3375 and DR_B. */
3376 data_reference *dr_a = dr_info_a->dr;
3377 data_reference *dr_b = dr_info_b->dr;
3378 poly_int64 init_a, init_b;
3379 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3380 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3381 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3382 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3383 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3384 || !ordered_p (init_a, init_b))
3385 return false;
3386
3387 /* Sort DR_A and DR_B by the address they access. */
3388 if (maybe_lt (init_b, init_a))
3389 {
3390 std::swap (init_a, init_b);
3391 std::swap (dr_info_a, dr_info_b);
3392 std::swap (dr_a, dr_b);
3393 }
3394
3395 /* If the two accesses could be dependent within a scalar iteration,
3396 make sure that we'd retain their order. */
3397 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3398 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3399 return false;
3400
3401 /* There is no alias if abs (DR_STEP) is greater than or equal to
3402 the bytes spanned by the combination of the two accesses. */
3403 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3404 return true;
3405 }
3406
3407 /* Function vect_prune_runtime_alias_test_list.
3408
3409 Prune a list of ddrs to be tested at run-time by versioning for alias.
3410 Merge several alias checks into one if possible.
3411 Return FALSE if the resulting list of ddrs is longer than allowed by
3412 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
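/* The pairs that survive pruning are later turned into run-time tests that
   the two accessed segments do not overlap, conceptually of the form
     addr_a + segment_length_a <= addr_b || addr_b + segment_length_b <= addr_a
   (a sketch only; the checks emitted elsewhere also fold in the access
   sizes and alignments recorded below).  */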
3413
3414 opt_result
3415 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3416 {
3417 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3418 hash_set <tree_pair_hash> compared_objects;
3419
3420 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3421 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3422 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3423 vec<vec_object_pair> &check_unequal_addrs
3424 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3425 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3426 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3427
3428 ddr_p ddr;
3429 unsigned int i;
3430 tree length_factor;
3431
3432 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3433
3434 /* Step values are irrelevant for aliasing if the number of vector
3435 iterations is equal to the number of scalar iterations (which can
3436 happen for fully-SLP loops). */
3437 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3438
3439 if (!ignore_step_p)
3440 {
3441 /* Convert the checks for nonzero steps into bound tests. */
3442 tree value;
3443 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3444 vect_check_lower_bound (loop_vinfo, value, true, 1);
3445 }
3446
3447 if (may_alias_ddrs.is_empty ())
3448 return opt_result::success ();
3449
3450 comp_alias_ddrs.create (may_alias_ddrs.length ());
3451
3452 unsigned int loop_depth
3453 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3454 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3455
3456 /* First, we collect all data ref pairs for aliasing checks. */
3457 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3458 {
3459 int comp_res;
3460 poly_uint64 lower_bound;
3461 tree segment_length_a, segment_length_b;
3462 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3463 unsigned int align_a, align_b;
3464
3465 /* Ignore the alias if the VF we chose ended up being no greater
3466 than the dependence distance. */
3467 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3468 continue;
3469
3470 if (DDR_OBJECT_A (ddr))
3471 {
3472 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3473 if (!compared_objects.add (new_pair))
3474 {
3475 if (dump_enabled_p ())
3476 dump_printf_loc (MSG_NOTE, vect_location,
3477 "checking that %T and %T"
3478 " have different addresses\n",
3479 new_pair.first, new_pair.second);
3480 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3481 }
3482 continue;
3483 }
3484
3485 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3486 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3487
3488 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3489 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3490
3491 /* Skip the pair if inter-iteration dependencies are irrelevant
3492 and intra-iteration dependencies are guaranteed to be honored. */
3493 if (ignore_step_p
3494 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3495 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3496 &lower_bound)))
3497 {
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_NOTE, vect_location,
3500 "no need for alias check between "
3501 "%T and %T when VF is 1\n",
3502 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3503 continue;
3504 }
3505
3506 /* See whether we can handle the alias using a bounds check on
3507 the step, and whether that's likely to be the best approach.
3508 (It might not be, for example, if the minimum step is much larger
3509 than the number of bytes handled by one vector iteration.) */
3510 if (!ignore_step_p
3511 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3512 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3513 &lower_bound)
3514 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3515 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3516 {
3517 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3518 if (dump_enabled_p ())
3519 {
3520 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3521 "%T and %T when the step %T is outside ",
3522 DR_REF (dr_info_a->dr),
3523 DR_REF (dr_info_b->dr),
3524 DR_STEP (dr_info_a->dr));
3525 if (unsigned_p)
3526 dump_printf (MSG_NOTE, "[0");
3527 else
3528 {
3529 dump_printf (MSG_NOTE, "(");
3530 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3531 }
3532 dump_printf (MSG_NOTE, ", ");
3533 dump_dec (MSG_NOTE, lower_bound);
3534 dump_printf (MSG_NOTE, ")\n");
3535 }
3536 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3537 unsigned_p, lower_bound);
3538 continue;
3539 }
3540
3541 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3542 if (dr_group_first_a)
3543 {
3544 stmt_info_a = dr_group_first_a;
3545 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3546 }
3547
3548 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3549 if (dr_group_first_b)
3550 {
3551 stmt_info_b = dr_group_first_b;
3552 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3553 }
3554
3555 if (ignore_step_p)
3556 {
3557 segment_length_a = size_zero_node;
3558 segment_length_b = size_zero_node;
3559 }
3560 else
3561 {
3562 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3563 DR_STEP (dr_info_b->dr), 0))
3564 length_factor = scalar_loop_iters;
3565 else
3566 length_factor = size_int (vect_factor);
3567 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3568 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3569 }
3570 access_size_a = vect_vfa_access_size (dr_info_a);
3571 access_size_b = vect_vfa_access_size (dr_info_b);
3572 align_a = vect_vfa_align (dr_info_a);
3573 align_b = vect_vfa_align (dr_info_b);
3574
3575 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3576 DR_BASE_ADDRESS (dr_info_b->dr));
3577 if (comp_res == 0)
3578 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3579 DR_OFFSET (dr_info_b->dr));
3580
3581 /* See whether the alias is known at compilation time. */
3582 if (comp_res == 0
3583 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3584 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3585 && poly_int_tree_p (segment_length_a)
3586 && poly_int_tree_p (segment_length_b))
3587 {
3588 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3589 segment_length_a,
3590 segment_length_b,
3591 access_size_a,
3592 access_size_b);
3593 if (res >= 0 && dump_enabled_p ())
3594 {
3595 dump_printf_loc (MSG_NOTE, vect_location,
3596 "can tell at compile time that %T and %T",
3597 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3598 if (res == 0)
3599 dump_printf (MSG_NOTE, " do not alias\n");
3600 else
3601 dump_printf (MSG_NOTE, " alias\n");
3602 }
3603
3604 if (res == 0)
3605 continue;
3606
3607 if (res == 1)
3608 return opt_result::failure_at (stmt_info_b->stmt,
3609 "not vectorized:"
3610 " compilation time alias: %G%G",
3611 stmt_info_a->stmt,
3612 stmt_info_b->stmt);
3613 }
3614
3615 dr_with_seg_len_pair_t dr_with_seg_len_pair
3616 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3617 access_size_a, align_a),
3618 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3619 access_size_b, align_b));
3620
3621 /* Canonicalize pairs by sorting the two DR members. */
3622 if (comp_res > 0)
3623 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3624
3625 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3626 }
3627
3628 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3629
3630 unsigned int count = (comp_alias_ddrs.length ()
3631 + check_unequal_addrs.length ());
3632
3633 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_NOTE, vect_location,
3635 "improved number of alias checks from %d to %d\n",
3636 may_alias_ddrs.length (), count);
3637 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3638 return opt_result::failure_at
3639 (vect_location,
3640 "number of versioning for alias "
3641 "run-time tests exceeds %d "
3642 "(--param vect-max-version-for-alias-checks)\n",
3643 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3644
3645 return opt_result::success ();
3646 }
3647
3648 /* Check whether we can use an internal function for a gather load
3649 or scatter store. READ_P is true for loads and false for stores.
3650 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3651 the type of the memory elements being loaded or stored. OFFSET_BITS
3652 is the number of bits in each scalar offset and OFFSET_SIGN is the
3653 sign of the offset. SCALE is the amount by which the offset should
3654 be multiplied *after* it has been converted to address width.
3655
3656 Return true if the function is supported, storing the function
3657 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
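/* As an illustrative case (not taken from the sources): gathering V2DF
   elements through sign-extended 32-bit indices with a scale of 8 passes
   the width checks (32 <= 64 and 64 == 64) and then reduces to asking
   whether the target implements IFN_GATHER_LOAD for that combination.  */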
3658
3659 bool
3660 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3661 tree memory_type, unsigned int offset_bits,
3662 signop offset_sign, int scale,
3663 internal_fn *ifn_out, tree *element_type_out)
3664 {
3665 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3666 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3667 if (offset_bits > element_bits)
3668 /* Internal functions require the offset to be the same width as
3669 the vector elements. We can extend narrower offsets, but it isn't
3670 safe to truncate wider offsets. */
3671 return false;
3672
3673 if (element_bits != memory_bits)
3674 /* For now the vector elements must be the same width as the
3675 memory elements. */
3676 return false;
3677
3678 /* Work out which function we need. */
3679 internal_fn ifn;
3680 if (read_p)
3681 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3682 else
3683 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3684
3685 /* Test whether the target supports this combination. */
3686 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3687 offset_sign, scale))
3688 return false;
3689
3690 *ifn_out = ifn;
3691 *element_type_out = TREE_TYPE (vectype);
3692 return true;
3693 }
3694
3695 /* STMT_INFO is a call to an internal gather load or scatter store function.
3696 Describe the operation in INFO. */
3697
3698 static void
3699 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3700 gather_scatter_info *info)
3701 {
3702 gcall *call = as_a <gcall *> (stmt_info->stmt);
3703 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3704 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3705
3706 info->ifn = gimple_call_internal_fn (call);
3707 info->decl = NULL_TREE;
3708 info->base = gimple_call_arg (call, 0);
3709 info->offset = gimple_call_arg (call, 1);
3710 info->offset_dt = vect_unknown_def_type;
3711 info->offset_vectype = NULL_TREE;
3712 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3713 info->element_type = TREE_TYPE (vectype);
3714 info->memory_type = TREE_TYPE (DR_REF (dr));
3715 }
3716
3717 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3718 gather load or scatter store. Describe the operation in *INFO if so. */
3719
3720 bool
3721 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3722 gather_scatter_info *info)
3723 {
3724 HOST_WIDE_INT scale = 1;
3725 poly_int64 pbitpos, pbitsize;
3726 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3727 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3728 tree offtype = NULL_TREE;
3729 tree decl = NULL_TREE, base, off;
3730 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3731 tree memory_type = TREE_TYPE (DR_REF (dr));
3732 machine_mode pmode;
3733 int punsignedp, reversep, pvolatilep = 0;
3734 internal_fn ifn;
3735 tree element_type;
3736 bool masked_p = false;
3737
3738 /* See whether this is already a call to a gather/scatter internal function.
3739 If not, see whether it's a masked load or store. */
3740 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3741 if (call && gimple_call_internal_p (call))
3742 {
3743 ifn = gimple_call_internal_fn (call);
3744 if (internal_gather_scatter_fn_p (ifn))
3745 {
3746 vect_describe_gather_scatter_call (stmt_info, info);
3747 return true;
3748 }
3749 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3750 }
3751
3752 /* True if we should aim to use internal functions rather than
3753 built-in functions. */
3754 bool use_ifn_p = (DR_IS_READ (dr)
3755 ? supports_vec_gather_load_p ()
3756 : supports_vec_scatter_store_p ());
3757
3758 base = DR_REF (dr);
3759 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3760 see if we can use the def stmt of the address. */
3761 if (masked_p
3762 && TREE_CODE (base) == MEM_REF
3763 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3764 && integer_zerop (TREE_OPERAND (base, 1))
3765 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3766 {
3767 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3768 if (is_gimple_assign (def_stmt)
3769 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3770 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3771 }
3772
3773 /* The gather and scatter builtins need address of the form
3774 loop_invariant + vector * {1, 2, 4, 8}
3775 or
3776 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3777 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3778 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3779 multiplications and additions in it. To get a vector, we need
3780 a single SSA_NAME that will be defined in the loop and will
3781 contain everything that is not loop invariant and that can be
3782 vectorized. The following code attempts to find such a preexisting
3783 SSA_NAME OFF and put the loop invariants into a tree BASE
3784 that can be gimplified before the loop. */
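/* A hypothetical example of the kind of access being decomposed:

     for (i = 0; i < n; i++)
       sum += p[idx[i]];

   where ideally BASE ends up holding the loop-invariant part (p), OFF the
   SSA_NAME computed from idx[i] and SCALE the element size sizeof (*p).  */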
3785 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3786 &punsignedp, &reversep, &pvolatilep);
3787 if (reversep)
3788 return false;
3789
3790 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3791
3792 if (TREE_CODE (base) == MEM_REF)
3793 {
3794 if (!integer_zerop (TREE_OPERAND (base, 1)))
3795 {
3796 if (off == NULL_TREE)
3797 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3798 else
3799 off = size_binop (PLUS_EXPR, off,
3800 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3801 }
3802 base = TREE_OPERAND (base, 0);
3803 }
3804 else
3805 base = build_fold_addr_expr (base);
3806
3807 if (off == NULL_TREE)
3808 off = size_zero_node;
3809
3810 /* If base is not loop invariant, then if off is 0 we start with just
3811 the constant offset in the loop invariant BASE and continue with base
3812 as OFF; otherwise give up.
3813 We could handle that case by gimplifying the addition of base + off
3814 into some SSA_NAME and use that as off, but for now punt. */
3815 if (!expr_invariant_in_loop_p (loop, base))
3816 {
3817 if (!integer_zerop (off))
3818 return false;
3819 off = base;
3820 base = size_int (pbytepos);
3821 }
3822 /* Otherwise put base + constant offset into the loop invariant BASE
3823 and continue with OFF. */
3824 else
3825 {
3826 base = fold_convert (sizetype, base);
3827 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3828 }
3829
3830 /* OFF at this point may be either a SSA_NAME or some tree expression
3831 from get_inner_reference. Try to peel off loop invariants from it
3832 into BASE as long as possible. */
3833 STRIP_NOPS (off);
3834 while (offtype == NULL_TREE)
3835 {
3836 enum tree_code code;
3837 tree op0, op1, add = NULL_TREE;
3838
3839 if (TREE_CODE (off) == SSA_NAME)
3840 {
3841 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3842
3843 if (expr_invariant_in_loop_p (loop, off))
3844 return false;
3845
3846 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3847 break;
3848
3849 op0 = gimple_assign_rhs1 (def_stmt);
3850 code = gimple_assign_rhs_code (def_stmt);
3851 op1 = gimple_assign_rhs2 (def_stmt);
3852 }
3853 else
3854 {
3855 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3856 return false;
3857 code = TREE_CODE (off);
3858 extract_ops_from_tree (off, &code, &op0, &op1);
3859 }
3860 switch (code)
3861 {
3862 case POINTER_PLUS_EXPR:
3863 case PLUS_EXPR:
3864 if (expr_invariant_in_loop_p (loop, op0))
3865 {
3866 add = op0;
3867 off = op1;
3868 do_add:
3869 add = fold_convert (sizetype, add);
3870 if (scale != 1)
3871 add = size_binop (MULT_EXPR, add, size_int (scale));
3872 base = size_binop (PLUS_EXPR, base, add);
3873 continue;
3874 }
3875 if (expr_invariant_in_loop_p (loop, op1))
3876 {
3877 add = op1;
3878 off = op0;
3879 goto do_add;
3880 }
3881 break;
3882 case MINUS_EXPR:
3883 if (expr_invariant_in_loop_p (loop, op1))
3884 {
3885 add = fold_convert (sizetype, op1);
3886 add = size_binop (MINUS_EXPR, size_zero_node, add);
3887 off = op0;
3888 goto do_add;
3889 }
3890 break;
3891 case MULT_EXPR:
3892 if (scale == 1 && tree_fits_shwi_p (op1))
3893 {
3894 int new_scale = tree_to_shwi (op1);
3895 /* Only treat this as a scaling operation if the target
3896 supports it. */
3897 if (use_ifn_p
3898 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3899 vectype, memory_type, 1,
3900 TYPE_SIGN (TREE_TYPE (op0)),
3901 new_scale, &ifn,
3902 &element_type))
3903 break;
3904 scale = new_scale;
3905 off = op0;
3906 continue;
3907 }
3908 break;
3909 case SSA_NAME:
3910 off = op0;
3911 continue;
3912 CASE_CONVERT:
3913 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3914 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3915 break;
3916 if (TYPE_PRECISION (TREE_TYPE (op0))
3917 == TYPE_PRECISION (TREE_TYPE (off)))
3918 {
3919 off = op0;
3920 continue;
3921 }
3922
3923 /* The internal functions need the offset to be the same width
3924 as the elements of VECTYPE. Don't include operations that
3925 cast the offset from that width to a different width. */
3926 if (use_ifn_p
3927 && (int_size_in_bytes (TREE_TYPE (vectype))
3928 == int_size_in_bytes (TREE_TYPE (off))))
3929 break;
3930
3931 if (TYPE_PRECISION (TREE_TYPE (op0))
3932 < TYPE_PRECISION (TREE_TYPE (off)))
3933 {
3934 off = op0;
3935 offtype = TREE_TYPE (off);
3936 STRIP_NOPS (off);
3937 continue;
3938 }
3939 break;
3940 default:
3941 break;
3942 }
3943 break;
3944 }
3945
3946 /* If at the end OFF still isn't a SSA_NAME or isn't
3947 defined in the loop, punt. */
3948 if (TREE_CODE (off) != SSA_NAME
3949 || expr_invariant_in_loop_p (loop, off))
3950 return false;
3951
3952 if (offtype == NULL_TREE)
3953 offtype = TREE_TYPE (off);
3954
3955 if (use_ifn_p)
3956 {
3957 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3958 memory_type, TYPE_PRECISION (offtype),
3959 TYPE_SIGN (offtype), scale, &ifn,
3960 &element_type))
3961 return false;
3962 }
3963 else
3964 {
3965 if (DR_IS_READ (dr))
3966 {
3967 if (targetm.vectorize.builtin_gather)
3968 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3969 }
3970 else
3971 {
3972 if (targetm.vectorize.builtin_scatter)
3973 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3974 }
3975
3976 if (!decl)
3977 return false;
3978
3979 ifn = IFN_LAST;
3980 element_type = TREE_TYPE (vectype);
3981 }
3982
3983 info->ifn = ifn;
3984 info->decl = decl;
3985 info->base = base;
3986 info->offset = off;
3987 info->offset_dt = vect_unknown_def_type;
3988 info->offset_vectype = NULL_TREE;
3989 info->scale = scale;
3990 info->element_type = element_type;
3991 info->memory_type = memory_type;
3992 return true;
3993 }
3994
3995 /* Find the data references in STMT, analyze them with respect to LOOP and
3996 append them to DATAREFS. Return false if datarefs in this stmt cannot
3997 be handled. */
3998
3999 opt_result
4000 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4001 vec<data_reference_p> *datarefs)
4002 {
4003 /* We can ignore clobbers for dataref analysis - they are removed during
4004 loop vectorization and BB vectorization checks dependences with a
4005 stmt walk. */
4006 if (gimple_clobber_p (stmt))
4007 return opt_result::success ();
4008
4009 if (gimple_has_volatile_ops (stmt))
4010 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4011 stmt);
4012
4013 if (stmt_can_throw_internal (cfun, stmt))
4014 return opt_result::failure_at (stmt,
4015 "not vectorized:"
4016 " statement can throw an exception: %G",
4017 stmt);
4018
4019 auto_vec<data_reference_p, 2> refs;
4020 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4021 if (!res)
4022 return res;
4023
4024 if (refs.is_empty ())
4025 return opt_result::success ();
4026
4027 if (refs.length () > 1)
4028 return opt_result::failure_at (stmt,
4029 "not vectorized:"
4030 " more than one data ref in stmt: %G", stmt);
4031
4032 if (gcall *call = dyn_cast <gcall *> (stmt))
4033 if (!gimple_call_internal_p (call)
4034 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4035 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4036 return opt_result::failure_at (stmt,
4037 "not vectorized: dr in a call %G", stmt);
4038
4039 data_reference_p dr = refs.pop ();
4040 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4041 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4042 return opt_result::failure_at (stmt,
4043 "not vectorized:"
4044 " statement is bitfield access %G", stmt);
4045
4046 if (DR_BASE_ADDRESS (dr)
4047 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4048 return opt_result::failure_at (stmt,
4049 "not vectorized:"
4050 " base addr of dr is a constant\n");
4051
4052 /* Check whether this may be a SIMD lane access and adjust the
4053 DR to make it easier for us to handle it. */
4054 if (loop
4055 && loop->simduid
4056 && (!DR_BASE_ADDRESS (dr)
4057 || !DR_OFFSET (dr)
4058 || !DR_INIT (dr)
4059 || !DR_STEP (dr)))
4060 {
4061 struct data_reference *newdr
4062 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4063 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4064 if (DR_BASE_ADDRESS (newdr)
4065 && DR_OFFSET (newdr)
4066 && DR_INIT (newdr)
4067 && DR_STEP (newdr)
4068 && integer_zerop (DR_STEP (newdr)))
4069 {
4070 tree off = DR_OFFSET (newdr);
4071 STRIP_NOPS (off);
4072 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4073 && TREE_CODE (off) == MULT_EXPR
4074 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4075 {
4076 tree step = TREE_OPERAND (off, 1);
4077 off = TREE_OPERAND (off, 0);
4078 STRIP_NOPS (off);
4079 if (CONVERT_EXPR_P (off)
4080 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4081 < TYPE_PRECISION (TREE_TYPE (off))))
4082 off = TREE_OPERAND (off, 0);
4083 if (TREE_CODE (off) == SSA_NAME)
4084 {
4085 gimple *def = SSA_NAME_DEF_STMT (off);
4086 tree reft = TREE_TYPE (DR_REF (newdr));
4087 if (is_gimple_call (def)
4088 && gimple_call_internal_p (def)
4089 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4090 {
4091 tree arg = gimple_call_arg (def, 0);
4092 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4093 arg = SSA_NAME_VAR (arg);
4094 if (arg == loop->simduid
4095 /* For now. */
4096 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4097 {
4098 DR_OFFSET (newdr) = ssize_int (0);
4099 DR_STEP (newdr) = step;
4100 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4101 DR_STEP_ALIGNMENT (newdr)
4102 = highest_pow2_factor (step);
4103 /* Mark as simd-lane access. */
4104 newdr->aux = (void *)-1;
4105 free_data_ref (dr);
4106 datarefs->safe_push (newdr);
4107 return opt_result::success ();
4108 }
4109 }
4110 }
4111 }
4112 }
4113 free_data_ref (newdr);
4114 }
4115
4116 datarefs->safe_push (dr);
4117 return opt_result::success ();
4118 }
4119
4120 /* Function vect_analyze_data_refs.
4121
4122 Find all the data references in the loop or basic block.
4123
4124 The general structure of the analysis of data refs in the vectorizer is as
4125 follows:
4126 1- vect_analyze_data_refs(loop/bb): call
4127 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4128 in the loop/bb and their dependences.
4129 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4130 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4131 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4132
4133 */
4134
4135 opt_result
4136 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4137 {
4138 struct loop *loop = NULL;
4139 unsigned int i;
4140 struct data_reference *dr;
4141 tree scalar_type;
4142
4143 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4144
4145 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4146 loop = LOOP_VINFO_LOOP (loop_vinfo);
4147
4148 /* Go through the data-refs, check that the analysis succeeded. Update
4149 pointer from stmt_vec_info struct to DR and vectype. */
4150
4151 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4152 FOR_EACH_VEC_ELT (datarefs, i, dr)
4153 {
4154 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4155 poly_uint64 vf;
4156
4157 gcc_assert (DR_REF (dr));
4158 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4159 gcc_assert (!stmt_info->dr_aux.dr);
4160 stmt_info->dr_aux.dr = dr;
4161 stmt_info->dr_aux.stmt = stmt_info;
4162
4163 /* Check that analysis of the data-ref succeeded. */
4164 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4165 || !DR_STEP (dr))
4166 {
4167 bool maybe_gather
4168 = DR_IS_READ (dr)
4169 && !TREE_THIS_VOLATILE (DR_REF (dr))
4170 && (targetm.vectorize.builtin_gather != NULL
4171 || supports_vec_gather_load_p ());
4172 bool maybe_scatter
4173 = DR_IS_WRITE (dr)
4174 && !TREE_THIS_VOLATILE (DR_REF (dr))
4175 && (targetm.vectorize.builtin_scatter != NULL
4176 || supports_vec_scatter_store_p ());
4177
4178 /* If target supports vector gather loads or scatter stores,
4179 see if they can be used. */
4180 if (is_a <loop_vec_info> (vinfo)
4181 && !nested_in_vect_loop_p (loop, stmt_info))
4182 {
4183 if (maybe_gather || maybe_scatter)
4184 {
4185 if (maybe_gather)
4186 gatherscatter = GATHER;
4187 else
4188 gatherscatter = SCATTER;
4189 }
4190 }
4191
4192 if (gatherscatter == SG_NONE)
4193 {
4194 if (dump_enabled_p ())
4195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4196 "not vectorized: data ref analysis "
4197 "failed %G", stmt_info->stmt);
4198 if (is_a <bb_vec_info> (vinfo))
4199 {
4200 /* In BB vectorization the ref can still participate
4201 in dependence analysis, we just can't vectorize it. */
4202 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4203 continue;
4204 }
4205 return opt_result::failure_at (stmt_info->stmt,
4206 "not vectorized:"
4207 " data ref analysis failed: %G",
4208 stmt_info->stmt);
4209 }
4210 }
4211
4212 /* See if this was detected as SIMD lane access. */
4213 if (dr->aux == (void *)-1)
4214 {
4215 if (nested_in_vect_loop_p (loop, stmt_info))
4216 return opt_result::failure_at (stmt_info->stmt,
4217 "not vectorized:"
4218 " data ref analysis failed: %G",
4219 stmt_info->stmt);
4220 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4221 }
4222
4223 tree base = get_base_address (DR_REF (dr));
4224 if (base && VAR_P (base) && DECL_NONALIASED (base))
4225 {
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4228 "not vectorized: base object not addressable "
4229 "for stmt: %G", stmt_info->stmt);
4230 if (is_a <bb_vec_info> (vinfo))
4231 {
4232 /* In BB vectorization the ref can still participate
4233 in dependence analysis, we just can't vectorize it. */
4234 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4235 continue;
4236 }
4237 return opt_result::failure_at (stmt_info->stmt,
4238 "not vectorized: base object not"
4239 " addressable for stmt: %G",
4240 stmt_info->stmt);
4241 }
4242
4243 if (is_a <loop_vec_info> (vinfo)
4244 && DR_STEP (dr)
4245 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4246 {
4247 if (nested_in_vect_loop_p (loop, stmt_info))
4248 return opt_result::failure_at (stmt_info->stmt,
4249 "not vectorized:"
4250 " not suitable for strided load %G",
4251 stmt_info->stmt);
4252 STMT_VINFO_STRIDED_P (stmt_info) = true;
4253 }
4254
4255 /* Update DR field in stmt_vec_info struct. */
4256
4257 /* If the dataref is in an inner-loop of the loop that is considered for
4258 vectorization, we also want to analyze the access relative to
4259 the outer-loop (DR contains information only relative to the
4260 inner-most enclosing loop). We do that by building a reference to the
4261 first location accessed by the inner-loop, and analyze it relative to
4262 the outer-loop. */
4263 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4264 {
4265 /* Build a reference to the first location accessed by the
4266 inner loop: *(BASE + INIT + OFFSET). By construction,
4267 this address must be invariant in the inner loop, so we
4268 can consider it as being used in the outer loop. */
4269 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4270 tree offset = unshare_expr (DR_OFFSET (dr));
4271 tree init = unshare_expr (DR_INIT (dr));
4272 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4273 init, offset);
4274 tree init_addr = fold_build_pointer_plus (base, init_offset);
4275 tree init_ref = build_fold_indirect_ref (init_addr);
4276
4277 if (dump_enabled_p ())
4278 dump_printf_loc (MSG_NOTE, vect_location,
4279 "analyze in outer loop: %T\n", init_ref);
4280
4281 opt_result res
4282 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4283 init_ref, loop, stmt_info->stmt);
4284 if (!res)
4285 /* dr_analyze_innermost already explained the failure. */
4286 return res;
4287
4288 if (dump_enabled_p ())
4289 dump_printf_loc (MSG_NOTE, vect_location,
4290 "\touter base_address: %T\n"
4291 "\touter offset from base address: %T\n"
4292 "\touter constant offset from base address: %T\n"
4293 "\touter step: %T\n"
4294 "\touter base alignment: %d\n\n"
4295 "\touter base misalignment: %d\n"
4296 "\touter offset alignment: %d\n"
4297 "\touter step alignment: %d\n",
4298 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4299 STMT_VINFO_DR_OFFSET (stmt_info),
4300 STMT_VINFO_DR_INIT (stmt_info),
4301 STMT_VINFO_DR_STEP (stmt_info),
4302 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4303 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4304 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4305 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4306 }
4307
4308 /* Set vectype for STMT. */
4309 scalar_type = TREE_TYPE (DR_REF (dr));
4310 STMT_VINFO_VECTYPE (stmt_info)
4311 = get_vectype_for_scalar_type (scalar_type);
4312 if (!STMT_VINFO_VECTYPE (stmt_info))
4313 {
4314 if (dump_enabled_p ())
4315 {
4316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4317 "not vectorized: no vectype for stmt: %G",
4318 stmt_info->stmt);
4319 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4321 scalar_type);
4322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4323 }
4324
4325 if (is_a <bb_vec_info> (vinfo))
4326 {
4327 /* No vector type is fine, the ref can still participate
4328 in dependence analysis, we just can't vectorize it. */
4329 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4330 continue;
4331 }
4332 return opt_result::failure_at (stmt_info->stmt,
4333 "not vectorized:"
4334 " no vectype for stmt: %G"
4335 " scalar_type: %T\n",
4336 stmt_info->stmt, scalar_type);
4337 }
4338 else
4339 {
4340 if (dump_enabled_p ())
4341 dump_printf_loc (MSG_NOTE, vect_location,
4342 "got vectype for stmt: %G%T\n",
4343 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4344 }
4345
4346 /* Adjust the minimal vectorization factor according to the
4347 vector type. */
4348 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4349 *min_vf = upper_bound (*min_vf, vf);
4350
4351 if (gatherscatter != SG_NONE)
4352 {
4353 gather_scatter_info gs_info;
4354 if (!vect_check_gather_scatter (stmt_info,
4355 as_a <loop_vec_info> (vinfo),
4356 &gs_info)
4357 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4358 return opt_result::failure_at
4359 (stmt_info->stmt,
4360 (gatherscatter == GATHER) ?
4361 "not vectorized: not suitable for gather load %G" :
4362 "not vectorized: not suitable for scatter store %G",
4363 stmt_info->stmt);
4364 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4365 }
4366 }
4367
4368 /* We used to stop processing and prune the list here. Verify we no
4369 longer need to. */
4370 gcc_assert (i == datarefs.length ());
4371
4372 return opt_result::success ();
4373 }
4374
4375
4376 /* Function vect_get_new_vect_var.
4377
4378 Returns a name for a new variable. The current naming scheme appends the
4379 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4380 the name of vectorizer generated variables, and appends that to NAME if
4381 provided. */
4382
4383 tree
4384 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4385 {
4386 const char *prefix;
4387 tree new_vect_var;
4388
4389 switch (var_kind)
4390 {
4391 case vect_simple_var:
4392 prefix = "vect";
4393 break;
4394 case vect_scalar_var:
4395 prefix = "stmp";
4396 break;
4397 case vect_mask_var:
4398 prefix = "mask";
4399 break;
4400 case vect_pointer_var:
4401 prefix = "vectp";
4402 break;
4403 default:
4404 gcc_unreachable ();
4405 }
4406
4407 if (name)
4408 {
4409 char* tmp = concat (prefix, "_", name, NULL);
4410 new_vect_var = create_tmp_reg (type, tmp);
4411 free (tmp);
4412 }
4413 else
4414 new_vect_var = create_tmp_reg (type, prefix);
4415
4416 return new_vect_var;
4417 }
4418
4419 /* Like vect_get_new_vect_var but return an SSA name. */
4420
4421 tree
4422 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4423 {
4424 const char *prefix;
4425 tree new_vect_var;
4426
4427 switch (var_kind)
4428 {
4429 case vect_simple_var:
4430 prefix = "vect";
4431 break;
4432 case vect_scalar_var:
4433 prefix = "stmp";
4434 break;
4435 case vect_pointer_var:
4436 prefix = "vectp";
4437 break;
4438 default:
4439 gcc_unreachable ();
4440 }
4441
4442 if (name)
4443 {
4444 char* tmp = concat (prefix, "_", name, NULL);
4445 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4446 free (tmp);
4447 }
4448 else
4449 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4450
4451 return new_vect_var;
4452 }
4453
4454 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4455
4456 static void
4457 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4458 {
4459 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4460 int misalign = DR_MISALIGNMENT (dr_info);
4461 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4462 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4463 else
4464 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4465 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4466 misalign);
4467 }
4468
4469 /* Function vect_create_addr_base_for_vector_ref.
4470
4471 Create an expression that computes the address of the first memory location
4472 that will be accessed for a data reference.
4473
4474 Input:
4475 STMT_INFO: The statement containing the data reference.
4476 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4477 OFFSET: Optional. If supplied, it is added to the initial address.
4478 LOOP: Specify relative to which loop-nest the address should be computed.
4479 For example, when the dataref is in an inner-loop nested in an
4480 outer-loop that is now being vectorized, LOOP can be either the
4481 outer-loop, or the inner-loop. The first memory location accessed
4482 by the following dataref ('in' points to short):
4483
4484 for (i=0; i<N; i++)
4485 for (j=0; j<M; j++)
4486 s += in[i+j]
4487
4488 is as follows:
4489 if LOOP=i_loop: &in (relative to i_loop)
4490 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4491 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4492 initial address. Unlike OFFSET, which is the number of elements to
4493 be added, BYTE_OFFSET is measured in bytes.
4494
4495 Output:
4496 1. Return an SSA_NAME whose value is the address of the memory location of
4497 the first vector of the data reference.
4498 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4499 these statement(s) which define the returned SSA_NAME.
4500
4501 FORNOW: We are only handling array accesses with step 1. */
4502
4503 tree
4504 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4505 gimple_seq *new_stmt_list,
4506 tree offset,
4507 tree byte_offset)
4508 {
4509 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4510 struct data_reference *dr = dr_info->dr;
4511 const char *base_name;
4512 tree addr_base;
4513 tree dest;
4514 gimple_seq seq = NULL;
4515 tree vect_ptr_type;
4516 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4517 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4518 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4519
4520 tree data_ref_base = unshare_expr (drb->base_address);
4521 tree base_offset = unshare_expr (drb->offset);
4522 tree init = unshare_expr (drb->init);
4523
4524 if (loop_vinfo)
4525 base_name = get_name (data_ref_base);
4526 else
4527 {
4528 base_offset = ssize_int (0);
4529 init = ssize_int (0);
4530 base_name = get_name (DR_REF (dr));
4531 }
4532
4533 /* Create base_offset */
4534 base_offset = size_binop (PLUS_EXPR,
4535 fold_convert (sizetype, base_offset),
4536 fold_convert (sizetype, init));
4537
4538 if (offset)
4539 {
4540 offset = fold_build2 (MULT_EXPR, sizetype,
4541 fold_convert (sizetype, offset), step);
4542 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4543 base_offset, offset);
4544 }
4545 if (byte_offset)
4546 {
4547 byte_offset = fold_convert (sizetype, byte_offset);
4548 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4549 base_offset, byte_offset);
4550 }
4551
4552 /* base + base_offset */
4553 if (loop_vinfo)
4554 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4555 else
4556 {
4557 addr_base = build1 (ADDR_EXPR,
4558 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4559 unshare_expr (DR_REF (dr)));
4560 }
4561
4562 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4563 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4564 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4565 gimple_seq_add_seq (new_stmt_list, seq);
4566
4567 if (DR_PTR_INFO (dr)
4568 && TREE_CODE (addr_base) == SSA_NAME
4569 && !SSA_NAME_PTR_INFO (addr_base))
4570 {
4571 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4572 if (offset || byte_offset)
4573 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4574 }
4575
4576 if (dump_enabled_p ())
4577 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4578
4579 return addr_base;
4580 }
4581
4582
4583 /* Function vect_create_data_ref_ptr.
4584
4585 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4586 location accessed in the loop by STMT_INFO, along with the def-use update
4587 chain to appropriately advance the pointer through the loop iterations.
4588 Also set aliasing information for the pointer. This pointer is used by
4589 the callers to this function to create a memory reference expression for
4590 vector load/store access.
4591
4592 Input:
4593 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4594 GIMPLE_ASSIGN <name, data-ref> or
4595 GIMPLE_ASSIGN <data-ref, name>.
4596 2. AGGR_TYPE: the type of the reference, which should be either a vector
4597 or an array.
4598 3. AT_LOOP: the loop where the vector memref is to be created.
4599 4. OFFSET (optional): an offset to be added to the initial address accessed
4600 by the data-ref in STMT_INFO.
4601 5. BSI: location where the new stmts are to be placed if there is no loop
4602 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4603 pointing to the initial address.
4604 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4605 to the initial address accessed by the data-ref in STMT_INFO. This is
4606 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4607 in bytes.
4608 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4609 to the IV during each iteration of the loop. NULL says to move
4610 by one copy of AGGR_TYPE up or down, depending on the step of the
4611 data reference.
4612
4613 Output:
4614 1. Declare a new ptr to vector_type, and have it point to the base of the
4615 data reference (the initial address accessed by the data reference).
4616 For example, for vector of type V8HI, the following code is generated:
4617
4618 v8hi *ap;
4619 ap = (v8hi *)initial_address;
4620
4621 if OFFSET is not supplied:
4622 initial_address = &a[init];
4623 if OFFSET is supplied:
4624 initial_address = &a[init + OFFSET];
4625 if BYTE_OFFSET is supplied:
4626 initial_address = &a[init] + BYTE_OFFSET;
4627
4628 Return the initial_address in INITIAL_ADDRESS.
4629
4630 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4631 update the pointer in each iteration of the loop.
4632
4633 Return the increment stmt that updates the pointer in PTR_INCR.
4634
4635 3. Return the pointer. */
4636
4637 tree
4638 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4639 struct loop *at_loop, tree offset,
4640 tree *initial_address, gimple_stmt_iterator *gsi,
4641 gimple **ptr_incr, bool only_init,
4642 tree byte_offset, tree iv_step)
4643 {
4644 const char *base_name;
4645 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4646 struct loop *loop = NULL;
4647 bool nested_in_vect_loop = false;
4648 struct loop *containing_loop = NULL;
4649 tree aggr_ptr_type;
4650 tree aggr_ptr;
4651 tree new_temp;
4652 gimple_seq new_stmt_list = NULL;
4653 edge pe = NULL;
4654 basic_block new_bb;
4655 tree aggr_ptr_init;
4656 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4657 struct data_reference *dr = dr_info->dr;
4658 tree aptr;
4659 gimple_stmt_iterator incr_gsi;
4660 bool insert_after;
4661 tree indx_before_incr, indx_after_incr;
4662 gimple *incr;
4663 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4664
4665 gcc_assert (iv_step != NULL_TREE
4666 || TREE_CODE (aggr_type) == ARRAY_TYPE
4667 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4668
4669 if (loop_vinfo)
4670 {
4671 loop = LOOP_VINFO_LOOP (loop_vinfo);
4672 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4673 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4674 pe = loop_preheader_edge (loop);
4675 }
4676 else
4677 {
4678 gcc_assert (bb_vinfo);
4679 only_init = true;
4680 *ptr_incr = NULL;
4681 }
4682
4683 /* Create an expression for the first address accessed by this load
4684 in LOOP. */
4685 base_name = get_name (DR_BASE_ADDRESS (dr));
4686
4687 if (dump_enabled_p ())
4688 {
4689 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4690 dump_printf_loc (MSG_NOTE, vect_location,
4691 "create %s-pointer variable to type: %T",
4692 get_tree_code_name (TREE_CODE (aggr_type)),
4693 aggr_type);
4694 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4695 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4696 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4697 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4698 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4699 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4700 else
4701 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4702 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4703 }
4704
4705 /* (1) Create the new aggregate-pointer variable.
4706 Vector and array types inherit the alias set of their component
4707 type by default so we need to use a ref-all pointer if the data
4708 reference does not conflict with the created aggregated data
4709 reference because it is not addressable. */
4710 bool need_ref_all = false;
4711 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4712 get_alias_set (DR_REF (dr))))
4713 need_ref_all = true;
4714 /* Likewise for any of the data references in the stmt group. */
4715 else if (DR_GROUP_SIZE (stmt_info) > 1)
4716 {
4717 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4718 do
4719 {
4720 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4721 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4722 get_alias_set (DR_REF (sdr))))
4723 {
4724 need_ref_all = true;
4725 break;
4726 }
4727 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4728 }
4729 while (sinfo);
4730 }
4731 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4732 need_ref_all);
4733 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4734
4735
4736 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4737 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4738 def-use update cycles for the pointer: one relative to the outer-loop
4739 (LOOP), which is what steps (3) and (4) below do. The other is relative
4740 to the inner-loop (which is the inner-most loop containing the dataref),
4741 and this is done by step (5) below.
4742
4743 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4744 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4745 redundant. Steps (3),(4) create the following:
4746
4747 vp0 = &base_addr;
4748 LOOP: vp1 = phi(vp0,vp2)
4749 ...
4750 ...
4751 vp2 = vp1 + step
4752 goto LOOP
4753
4754 If there is an inner-loop nested in loop, then step (5) will also be
4755 applied, and an additional update in the inner-loop will be created:
4756
4757 vp0 = &base_addr;
4758 LOOP: vp1 = phi(vp0,vp2)
4759 ...
4760 inner: vp3 = phi(vp1,vp4)
4761 vp4 = vp3 + inner_step
4762 if () goto inner
4763 ...
4764 vp2 = vp1 + step
4765 if () goto LOOP */
4766
4767 /* (2) Calculate the initial address of the aggregate-pointer, and set
4768 the aggregate-pointer to point to it before the loop. */
4769
4770 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4771
4772 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4773 offset, byte_offset);
4774 if (new_stmt_list)
4775 {
4776 if (pe)
4777 {
4778 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4779 gcc_assert (!new_bb);
4780 }
4781 else
4782 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4783 }
4784
4785 *initial_address = new_temp;
4786 aggr_ptr_init = new_temp;
4787
4788 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4789 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4790 inner-loop nested in LOOP (during outer-loop vectorization). */
4791
4792 /* No update in loop is required. */
4793 if (only_init && (!loop_vinfo || at_loop == loop))
4794 aptr = aggr_ptr_init;
4795 else
4796 {
4797 /* Accesses to invariant addresses should be handled specially
4798 by the caller. */
4799 tree step = vect_dr_behavior (dr_info)->step;
4800 gcc_assert (!integer_zerop (step));
4801
4802 if (iv_step == NULL_TREE)
4803 {
4804 /* The step of the aggregate pointer is the type size,
4805 negated for downward accesses. */
4806 iv_step = TYPE_SIZE_UNIT (aggr_type);
4807 if (tree_int_cst_sgn (step) == -1)
4808 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4809 }
4810
4811 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4812
4813 create_iv (aggr_ptr_init,
4814 fold_convert (aggr_ptr_type, iv_step),
4815 aggr_ptr, loop, &incr_gsi, insert_after,
4816 &indx_before_incr, &indx_after_incr);
4817 incr = gsi_stmt (incr_gsi);
4818 loop_vinfo->add_stmt (incr);
4819
4820 /* Copy the points-to information if it exists. */
4821 if (DR_PTR_INFO (dr))
4822 {
4823 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4824 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4825 }
4826 if (ptr_incr)
4827 *ptr_incr = incr;
4828
4829 aptr = indx_before_incr;
4830 }
4831
4832 if (!nested_in_vect_loop || only_init)
4833 return aptr;
4834
4835
4836 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4837 nested in LOOP, if it exists. */
4838
4839 gcc_assert (nested_in_vect_loop);
4840 if (!only_init)
4841 {
4842 standard_iv_increment_position (containing_loop, &incr_gsi,
4843 &insert_after);
4844 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4845 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4846 &indx_after_incr);
4847 incr = gsi_stmt (incr_gsi);
4848 loop_vinfo->add_stmt (incr);
4849
4850 /* Copy the points-to information if it exists. */
4851 if (DR_PTR_INFO (dr))
4852 {
4853 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4854 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4855 }
4856 if (ptr_incr)
4857 *ptr_incr = incr;
4858
4859 return indx_before_incr;
4860 }
4861 else
4862 gcc_unreachable ();
4863 }
4864
4865
4866 /* Function bump_vector_ptr
4867
4868 Increment a pointer (to a vector type) by vector-size. If requested,
4869 i.e. if PTR-INCR is given, then also connect the new increment stmt
4870 to the existing def-use update-chain of the pointer, by modifying
4871 the PTR_INCR as illustrated below:
4872
4873 The pointer def-use update-chain before this function:
4874 DATAREF_PTR = phi (p_0, p_2)
4875 ....
4876 PTR_INCR: p_2 = DATAREF_PTR + step
4877
4878 The pointer def-use update-chain after this function:
4879 DATAREF_PTR = phi (p_0, p_2)
4880 ....
4881 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4882 ....
4883 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4884
4885 Input:
4886 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4887 in the loop.
4888 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4889 the loop. The increment amount across iterations is expected
4890 to be vector_size.
4891 BSI - location where the new update stmt is to be placed.
4892 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4893 BUMP - optional. The offset by which to bump the pointer. If not given,
4894 the offset is assumed to be vector_size.
4895
4896 Output: Return NEW_DATAREF_PTR as illustrated above.
4897
4898 */
4899
4900 tree
4901 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4902 stmt_vec_info stmt_info, tree bump)
4903 {
4904 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4905 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4906 tree update = TYPE_SIZE_UNIT (vectype);
4907 gassign *incr_stmt;
4908 ssa_op_iter iter;
4909 use_operand_p use_p;
4910 tree new_dataref_ptr;
4911
4912 if (bump)
4913 update = bump;
4914
4915 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4916 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4917 else
4918 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4919 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4920 dataref_ptr, update);
4921 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4922
4923 /* Copy the points-to information if it exists. */
4924 if (DR_PTR_INFO (dr))
4925 {
4926 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4927 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4928 }
4929
4930 if (!ptr_incr)
4931 return new_dataref_ptr;
4932
4933 /* Update the vector-pointer's cross-iteration increment. */
4934 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4935 {
4936 tree use = USE_FROM_PTR (use_p);
4937
4938 if (use == dataref_ptr)
4939 SET_USE (use_p, new_dataref_ptr);
4940 else
4941 gcc_assert (operand_equal_p (use, update, 0));
4942 }
4943
4944 return new_dataref_ptr;
4945 }
4946
4947
4948 /* Copy memory reference info such as base/clique from the SRC reference
4949 to the DEST MEM_REF. */
4950
4951 void
4952 vect_copy_ref_info (tree dest, tree src)
4953 {
4954 if (TREE_CODE (dest) != MEM_REF)
4955 return;
4956
4957 tree src_base = src;
4958 while (handled_component_p (src_base))
4959 src_base = TREE_OPERAND (src_base, 0);
4960 if (TREE_CODE (src_base) != MEM_REF
4961 && TREE_CODE (src_base) != TARGET_MEM_REF)
4962 return;
4963
4964 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4965 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4966 }
4967
4968
4969 /* Function vect_create_destination_var.
4970
4971 Create a new temporary of type VECTYPE. */
4972
4973 tree
4974 vect_create_destination_var (tree scalar_dest, tree vectype)
4975 {
4976 tree vec_dest;
4977 const char *name;
4978 char *new_name;
4979 tree type;
4980 enum vect_var_kind kind;
4981
4982 kind = vectype
4983 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4984 ? vect_mask_var
4985 : vect_simple_var
4986 : vect_scalar_var;
4987 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4988
4989 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4990
4991 name = get_name (scalar_dest);
4992 if (name)
4993 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4994 else
4995 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4996 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4997 free (new_name);
4998
4999 return vec_dest;
5000 }
5001
5002 /* Function vect_grouped_store_supported.
5003
5004 Returns TRUE if interleave high and interleave low permutations
5005 are supported, and FALSE otherwise. */
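/* A worked example (an illustrative assumption, not taken from the code:
   4-element vectors, so nelt == 4): for a group of 3 stores the first
   interleaved output vector is built from two constant permutations,

     t  = VEC_PERM_EXPR <v0, v1, {0, 4, 0, 1}>   element 2 is a don't-care
     r0 = VEC_PERM_EXPR <t,  v2, {0, 1, 4, 3}>   r0 = {v0[0], v1[0], v2[0], v0[1]}

   and the checks below verify that the target supports both kinds of
   constant permutation.  */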
5006
5007 bool
5008 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5009 {
5010 machine_mode mode = TYPE_MODE (vectype);
5011
5012 /* vect_permute_store_chain requires the group size to be equal to 3 or
5013 be a power of two. */
5014 if (count != 3 && exact_log2 (count) == -1)
5015 {
5016 if (dump_enabled_p ())
5017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5018 "the size of the group of accesses"
5019 " is not a power of 2 or not equal to 3\n");
5020 return false;
5021 }
5022
5023 /* Check that the permutation is supported. */
5024 if (VECTOR_MODE_P (mode))
5025 {
5026 unsigned int i;
5027 if (count == 3)
5028 {
5029 unsigned int j0 = 0, j1 = 0, j2 = 0;
5030 unsigned int i, j;
5031
5032 unsigned int nelt;
5033 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5034 {
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5037 "cannot handle groups of 3 stores for"
5038 " variable-length vectors\n");
5039 return false;
5040 }
5041
5042 vec_perm_builder sel (nelt, nelt, 1);
5043 sel.quick_grow (nelt);
5044 vec_perm_indices indices;
5045 for (j = 0; j < 3; j++)
5046 {
5047 int nelt0 = ((3 - j) * nelt) % 3;
5048 int nelt1 = ((3 - j) * nelt + 1) % 3;
5049 int nelt2 = ((3 - j) * nelt + 2) % 3;
5050 for (i = 0; i < nelt; i++)
5051 {
5052 if (3 * i + nelt0 < nelt)
5053 sel[3 * i + nelt0] = j0++;
5054 if (3 * i + nelt1 < nelt)
5055 sel[3 * i + nelt1] = nelt + j1++;
5056 if (3 * i + nelt2 < nelt)
5057 sel[3 * i + nelt2] = 0;
5058 }
5059 indices.new_vector (sel, 2, nelt);
5060 if (!can_vec_perm_const_p (mode, indices))
5061 {
5062 if (dump_enabled_p ())
5063 dump_printf (MSG_MISSED_OPTIMIZATION,
5064 "permutation op not supported by target.\n");
5065 return false;
5066 }
5067
5068 for (i = 0; i < nelt; i++)
5069 {
5070 if (3 * i + nelt0 < nelt)
5071 sel[3 * i + nelt0] = 3 * i + nelt0;
5072 if (3 * i + nelt1 < nelt)
5073 sel[3 * i + nelt1] = 3 * i + nelt1;
5074 if (3 * i + nelt2 < nelt)
5075 sel[3 * i + nelt2] = nelt + j2++;
5076 }
5077 indices.new_vector (sel, 2, nelt);
5078 if (!can_vec_perm_const_p (mode, indices))
5079 {
5080 if (dump_enabled_p ())
5081 dump_printf (MSG_MISSED_OPTIMIZATION,
5082 "permutation op not supported by target.\n");
5083 return false;
5084 }
5085 }
5086 return true;
5087 }
5088 else
5089 {
5090 /* If length is not equal to 3 then only power of 2 is supported. */
5091 gcc_assert (pow2p_hwi (count));
5092 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5093
5094 /* The encoding has 2 interleaved stepped patterns. */
5095 vec_perm_builder sel (nelt, 2, 3);
5096 sel.quick_grow (6);
5097 for (i = 0; i < 3; i++)
5098 {
5099 sel[i * 2] = i;
5100 sel[i * 2 + 1] = i + nelt;
5101 }
5102 vec_perm_indices indices (sel, 2, nelt);
5103 if (can_vec_perm_const_p (mode, indices))
5104 {
5105 for (i = 0; i < 6; i++)
5106 sel[i] += exact_div (nelt, 2);
5107 indices.new_vector (sel, 2, nelt);
5108 if (can_vec_perm_const_p (mode, indices))
5109 return true;
5110 }
5111 }
5112 }
5113
5114 if (dump_enabled_p ())
5115 dump_printf (MSG_MISSED_OPTIMIZATION,
5116 "permutation op not supported by target.\n");
5117 return false;
5118 }
5119
5120
5121 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5122 type VECTYPE. MASKED_P says whether the masked form is needed. */
5123
5124 bool
5125 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5126 bool masked_p)
5127 {
5128 if (masked_p)
5129 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5130 vec_mask_store_lanes_optab,
5131 vectype, count);
5132 else
5133 return vect_lanes_optab_supported_p ("vec_store_lanes",
5134 vec_store_lanes_optab,
5135 vectype, count);
5136 }
5137
5138
5139 /* Function vect_permute_store_chain.
5140
5141 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5142 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5143 the data correctly for the stores. Return the final references for stores
5144 in RESULT_CHAIN.
5145
5146 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5147 The input is 4 vectors each containing 8 elements. We assign a number to
5148 each element, the input sequence is:
5149
5150 1st vec: 0 1 2 3 4 5 6 7
5151 2nd vec: 8 9 10 11 12 13 14 15
5152 3rd vec: 16 17 18 19 20 21 22 23
5153 4th vec: 24 25 26 27 28 29 30 31
5154
5155 The output sequence should be:
5156
5157 1st vec: 0 8 16 24 1 9 17 25
5158 2nd vec: 2 10 18 26 3 11 19 27
5159 3rd vec: 4 12 20 28 5 13 21 29
5160 4th vec: 6 14 22 30 7 15 23 31
5161
5162 i.e., we interleave the contents of the four vectors in their order.
5163
5164 We use interleave_high/low instructions to create such output. The input of
5165 each interleave_high/low operation is two vectors:
5166 1st vec 2nd vec
5167 0 1 2 3 4 5 6 7
5168 the even elements of the result vector are obtained left-to-right from the
5169 high/low elements of the first vector. The odd elements of the result are
5170 obtained left-to-right from the high/low elements of the second vector.
5171 The output of interleave_high will be: 0 4 1 5
5172 and of interleave_low: 2 6 3 7
5173
5174
5175 The permutation is done in log LENGTH stages. In each stage interleave_high
5176 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5177 where the first argument is taken from the first half of DR_CHAIN and the
5178 second argument from its second half.
5179 In our example,
5180
5181 I1: interleave_high (1st vec, 3rd vec)
5182 I2: interleave_low (1st vec, 3rd vec)
5183 I3: interleave_high (2nd vec, 4th vec)
5184 I4: interleave_low (2nd vec, 4th vec)
5185
5186 The output for the first stage is:
5187
5188 I1: 0 16 1 17 2 18 3 19
5189 I2: 4 20 5 21 6 22 7 23
5190 I3: 8 24 9 25 10 26 11 27
5191 I4: 12 28 13 29 14 30 15 31
5192
5193 The output of the second stage, i.e. the final result is:
5194
5195 I1: 0 8 16 24 1 9 17 25
5196 I2: 2 10 18 26 3 11 19 27
5197 I3: 4 12 20 28 5 13 21 29
5198 I4: 6 14 22 30 7 15 23 31. */
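/* For reference, a minimal scalar sketch of the two permutations used below
   (illustration only; interleave_high takes the first halves of the two
   input vectors, interleave_low the second halves):

     static void
     interleave_high (const int *a, const int *b, int *out, int n)
     {
       for (int i = 0; i < n / 2; i++)
         {
           out[2 * i] = a[i];          // even slots from the first vector
           out[2 * i + 1] = b[i];      // odd slots from the second vector
         }
     }

     static void
     interleave_low (const int *a, const int *b, int *out, int n)
     {
       for (int i = 0; i < n / 2; i++)
         {
           out[2 * i] = a[n / 2 + i];
           out[2 * i + 1] = b[n / 2 + i];
         }
     }

   which corresponds to the {0, nelt, 1, nelt+1, ...} and
   {nelt/2, nelt*3/2, nelt/2+1, ...} masks generated below.  */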
5199
5200 void
5201 vect_permute_store_chain (vec<tree> dr_chain,
5202 unsigned int length,
5203 stmt_vec_info stmt_info,
5204 gimple_stmt_iterator *gsi,
5205 vec<tree> *result_chain)
5206 {
5207 tree vect1, vect2, high, low;
5208 gimple *perm_stmt;
5209 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5210 tree perm_mask_low, perm_mask_high;
5211 tree data_ref;
5212 tree perm3_mask_low, perm3_mask_high;
5213 unsigned int i, j, n, log_length = exact_log2 (length);
5214
5215 result_chain->quick_grow (length);
5216 memcpy (result_chain->address (), dr_chain.address (),
5217 length * sizeof (tree));
5218
5219 if (length == 3)
5220 {
5221 /* vect_grouped_store_supported ensures that this is constant. */
5222 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5223 unsigned int j0 = 0, j1 = 0, j2 = 0;
5224
5225 vec_perm_builder sel (nelt, nelt, 1);
5226 sel.quick_grow (nelt);
5227 vec_perm_indices indices;
5228 for (j = 0; j < 3; j++)
5229 {
5230 int nelt0 = ((3 - j) * nelt) % 3;
5231 int nelt1 = ((3 - j) * nelt + 1) % 3;
5232 int nelt2 = ((3 - j) * nelt + 2) % 3;
5233
5234 for (i = 0; i < nelt; i++)
5235 {
5236 if (3 * i + nelt0 < nelt)
5237 sel[3 * i + nelt0] = j0++;
5238 if (3 * i + nelt1 < nelt)
5239 sel[3 * i + nelt1] = nelt + j1++;
5240 if (3 * i + nelt2 < nelt)
5241 sel[3 * i + nelt2] = 0;
5242 }
5243 indices.new_vector (sel, 2, nelt);
5244 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5245
5246 for (i = 0; i < nelt; i++)
5247 {
5248 if (3 * i + nelt0 < nelt)
5249 sel[3 * i + nelt0] = 3 * i + nelt0;
5250 if (3 * i + nelt1 < nelt)
5251 sel[3 * i + nelt1] = 3 * i + nelt1;
5252 if (3 * i + nelt2 < nelt)
5253 sel[3 * i + nelt2] = nelt + j2++;
5254 }
5255 indices.new_vector (sel, 2, nelt);
5256 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5257
5258 vect1 = dr_chain[0];
5259 vect2 = dr_chain[1];
5260
5261 /* Create interleaving stmt:
5262 low = VEC_PERM_EXPR <vect1, vect2,
5263 {j, nelt, *, j + 1, nelt + j + 1, *,
5264 j + 2, nelt + j + 2, *, ...}> */
5265 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5266 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5267 vect2, perm3_mask_low);
5268 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5269
5270 vect1 = data_ref;
5271 vect2 = dr_chain[2];
5272 /* Create interleaving stmt:
5273 low = VEC_PERM_EXPR <vect1, vect2,
5274 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5275 6, 7, nelt + j + 2, ...}> */
5276 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5277 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5278 vect2, perm3_mask_high);
5279 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5280 (*result_chain)[j] = data_ref;
5281 }
5282 }
5283 else
5284 {
5285 /* If length is not equal to 3 then only power of 2 is supported. */
5286 gcc_assert (pow2p_hwi (length));
5287
5288 /* The encoding has 2 interleaved stepped patterns. */
5289 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5290 vec_perm_builder sel (nelt, 2, 3);
5291 sel.quick_grow (6);
5292 for (i = 0; i < 3; i++)
5293 {
5294 sel[i * 2] = i;
5295 sel[i * 2 + 1] = i + nelt;
5296 }
5297 vec_perm_indices indices (sel, 2, nelt);
5298 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5299
5300 for (i = 0; i < 6; i++)
5301 sel[i] += exact_div (nelt, 2);
5302 indices.new_vector (sel, 2, nelt);
5303 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5304
5305 for (i = 0, n = log_length; i < n; i++)
5306 {
5307 for (j = 0; j < length/2; j++)
5308 {
5309 vect1 = dr_chain[j];
5310 vect2 = dr_chain[j+length/2];
5311
5312 /* Create interleaving stmt:
5313 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5314 ...}> */
5315 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5316 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5317 vect2, perm_mask_high);
5318 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5319 (*result_chain)[2*j] = high;
5320
5321 /* Create interleaving stmt:
5322 low = VEC_PERM_EXPR <vect1, vect2,
5323 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5324 ...}> */
5325 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5326 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5327 vect2, perm_mask_low);
5328 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5329 (*result_chain)[2*j+1] = low;
5330 }
5331 memcpy (dr_chain.address (), result_chain->address (),
5332 length * sizeof (tree));
5333 }
5334 }
5335 }
5336
5337 /* Function vect_setup_realignment
5338
5339 This function is called when vectorizing an unaligned load using
5340 the dr_explicit_realign[_optimized] scheme.
5341 This function generates the following code at the loop prolog:
5342
5343 p = initial_addr;
5344 x msq_init = *(floor(p)); # prolog load
5345 realignment_token = call target_builtin;
5346 loop:
5347 x msq = phi (msq_init, ---)
5348
5349 The stmts marked with x are generated only for the case of
5350 dr_explicit_realign_optimized.
5351
5352 The code above sets up a new (vector) pointer, pointing to the first
5353 location accessed by STMT_INFO, and a "floor-aligned" load using that
5354 pointer. It also generates code to compute the "realignment-token"
5355 (if the relevant target hook was defined), and creates a phi-node at the
5356 loop-header bb whose arguments are the result of the prolog-load (created
5357 by this function) and the result of a load that takes place in the loop
5358 (to be created by the caller to this function).
5359
5360 For the case of dr_explicit_realign_optimized:
5361 The caller to this function uses the phi-result (msq) to create the
5362 realignment code inside the loop, and sets up the missing phi argument,
5363 as follows:
5364 loop:
5365 msq = phi (msq_init, lsq)
5366 lsq = *(floor(p')); # load in loop
5367 result = realign_load (msq, lsq, realignment_token);
5368
5369 For the case of dr_explicit_realign:
5370 loop:
5371 msq = *(floor(p)); # load in loop
5372 p' = p + (VS-1);
5373 lsq = *(floor(p')); # load in loop
5374 result = realign_load (msq, lsq, realignment_token);
5375
5376 Input:
5377 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5378 a memory location that may be unaligned.
5379 BSI - place where new code is to be inserted.
5380 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5381 is used.
5382
5383 Output:
5384 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5385 target hook, if defined.
5386 Return value - the result of the loop-header phi node. */
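/* Conceptually (a plain-C sketch under the assumption that the realignment
   token simply encodes the element misalignment K of the original address;
   the actual token semantics are target-defined):

     static void
     realign_load (const int *msq, const int *lsq, int k, int *out, int n)
     {
       // msq is the aligned vector at floor(p), lsq the aligned vector at
       // floor(p + VS - 1); the unaligned vector at p straddles the two.
       for (int i = 0; i < n; i++)
         out[i] = (k + i < n) ? msq[k + i] : lsq[k + i - n];
     }
*/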
5387
5388 tree
5389 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5390 tree *realignment_token,
5391 enum dr_alignment_support alignment_support_scheme,
5392 tree init_addr,
5393 struct loop **at_loop)
5394 {
5395 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5397 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5398 struct data_reference *dr = dr_info->dr;
5399 struct loop *loop = NULL;
5400 edge pe = NULL;
5401 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5402 tree vec_dest;
5403 gimple *inc;
5404 tree ptr;
5405 tree data_ref;
5406 basic_block new_bb;
5407 tree msq_init = NULL_TREE;
5408 tree new_temp;
5409 gphi *phi_stmt;
5410 tree msq = NULL_TREE;
5411 gimple_seq stmts = NULL;
5412 bool compute_in_loop = false;
5413 bool nested_in_vect_loop = false;
5414 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5415 struct loop *loop_for_initial_load = NULL;
5416
5417 if (loop_vinfo)
5418 {
5419 loop = LOOP_VINFO_LOOP (loop_vinfo);
5420 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5421 }
5422
5423 gcc_assert (alignment_support_scheme == dr_explicit_realign
5424 || alignment_support_scheme == dr_explicit_realign_optimized);
5425
5426 /* We need to generate three things:
5427 1. the misalignment computation
5428 2. the extra vector load (for the optimized realignment scheme).
5429 3. the phi node for the two vectors from which the realignment is
5430 done (for the optimized realignment scheme). */
5431
5432 /* 1. Determine where to generate the misalignment computation.
5433
5434 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5435 calculation will be generated by this function, outside the loop (in the
5436 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5437 caller, inside the loop.
5438
5439 Background: If the misalignment remains fixed throughout the iterations of
5440 the loop, then both realignment schemes are applicable, and also the
5441 misalignment computation can be done outside LOOP. This is because we are
5442 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5443 are a multiple of VS (the Vector Size), and therefore the misalignment in
5444 different vectorized LOOP iterations is always the same.
5445 The problem arises only if the memory access is in an inner-loop nested
5446 inside LOOP, which is now being vectorized using outer-loop vectorization.
5447 This is the only case when the misalignment of the memory access may not
5448 remain fixed throughout the iterations of the inner-loop (as explained in
5449 detail in vect_supportable_dr_alignment). In this case, not only is the
5450 optimized realignment scheme not applicable, but also the misalignment
5451 computation (and generation of the realignment token that is passed to
5452 REALIGN_LOAD) have to be done inside the loop.
5453
5454 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5455 or not, which in turn determines if the misalignment is computed inside
5456 the inner-loop, or outside LOOP. */
5457
5458 if (init_addr != NULL_TREE || !loop_vinfo)
5459 {
5460 compute_in_loop = true;
5461 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5462 }
5463
5464
5465 /* 2. Determine where to generate the extra vector load.
5466
5467 For the optimized realignment scheme, instead of generating two vector
5468 loads in each iteration, we generate a single extra vector load in the
5469 preheader of the loop, and in each iteration reuse the result of the
5470 vector load from the previous iteration. In case the memory access is in
5471 an inner-loop nested inside LOOP, which is now being vectorized using
5472 outer-loop vectorization, we need to determine whether this initial vector
5473 load should be generated at the preheader of the inner-loop, or can be
5474 generated at the preheader of LOOP. If the memory access has no evolution
5475 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5476 to be generated inside LOOP (in the preheader of the inner-loop). */
5477
5478 if (nested_in_vect_loop)
5479 {
5480 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5481 bool invariant_in_outerloop =
5482 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5483 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5484 }
5485 else
5486 loop_for_initial_load = loop;
5487 if (at_loop)
5488 *at_loop = loop_for_initial_load;
5489
5490 if (loop_for_initial_load)
5491 pe = loop_preheader_edge (loop_for_initial_load);
5492
5493 /* 3. For the case of the optimized realignment, create the first vector
5494 load at the loop preheader. */
5495
5496 if (alignment_support_scheme == dr_explicit_realign_optimized)
5497 {
5498 /* Create msq_init = *(floor(p1)) in the loop preheader */
5499 gassign *new_stmt;
5500
5501 gcc_assert (!compute_in_loop);
5502 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5503 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5504 loop_for_initial_load, NULL_TREE,
5505 &init_addr, NULL, &inc, true);
5506 if (TREE_CODE (ptr) == SSA_NAME)
5507 new_temp = copy_ssa_name (ptr);
5508 else
5509 new_temp = make_ssa_name (TREE_TYPE (ptr));
5510 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5511 tree type = TREE_TYPE (ptr);
5512 new_stmt = gimple_build_assign
5513 (new_temp, BIT_AND_EXPR, ptr,
5514 fold_build2 (MINUS_EXPR, type,
5515 build_int_cst (type, 0),
5516 build_int_cst (type, align)));
5517 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5518 gcc_assert (!new_bb);
5519 data_ref
5520 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5521 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5522 vect_copy_ref_info (data_ref, DR_REF (dr));
5523 new_stmt = gimple_build_assign (vec_dest, data_ref);
5524 new_temp = make_ssa_name (vec_dest, new_stmt);
5525 gimple_assign_set_lhs (new_stmt, new_temp);
5526 if (pe)
5527 {
5528 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5529 gcc_assert (!new_bb);
5530 }
5531 else
5532 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5533
5534 msq_init = gimple_assign_lhs (new_stmt);
5535 }
5536
5537 /* 4. Create realignment token using a target builtin, if available.
5538 It is done either inside the containing loop, or before LOOP (as
5539 determined above). */
5540
5541 if (targetm.vectorize.builtin_mask_for_load)
5542 {
5543 gcall *new_stmt;
5544 tree builtin_decl;
5545
5546 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5547 if (!init_addr)
5548 {
5549 /* Generate the INIT_ADDR computation outside LOOP. */
5550 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5551 NULL_TREE);
5552 if (loop)
5553 {
5554 pe = loop_preheader_edge (loop);
5555 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5556 gcc_assert (!new_bb);
5557 }
5558 else
5559 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5560 }
5561
5562 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5563 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5564 vec_dest =
5565 vect_create_destination_var (scalar_dest,
5566 gimple_call_return_type (new_stmt));
5567 new_temp = make_ssa_name (vec_dest, new_stmt);
5568 gimple_call_set_lhs (new_stmt, new_temp);
5569
5570 if (compute_in_loop)
5571 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5572 else
5573 {
5574 /* Generate the misalignment computation outside LOOP. */
5575 pe = loop_preheader_edge (loop);
5576 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5577 gcc_assert (!new_bb);
5578 }
5579
5580 *realignment_token = gimple_call_lhs (new_stmt);
5581
5582 /* The result of the CALL_EXPR to this builtin is determined from
5583 the value of the parameter and no global variables are touched
5584 which makes the builtin a "const" function. Requiring the
5585 builtin to have the "const" attribute makes it unnecessary
5586 to call mark_call_clobbered. */
5587 gcc_assert (TREE_READONLY (builtin_decl));
5588 }
5589
5590 if (alignment_support_scheme == dr_explicit_realign)
5591 return msq;
5592
5593 gcc_assert (!compute_in_loop);
5594 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5595
5596
5597 /* 5. Create msq = phi <msq_init, lsq> in loop */
5598
5599 pe = loop_preheader_edge (containing_loop);
5600 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5601 msq = make_ssa_name (vec_dest);
5602 phi_stmt = create_phi_node (msq, containing_loop->header);
5603 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5604
5605 return msq;
5606 }
5607
5608
5609 /* Function vect_grouped_load_supported.
5610
5611 COUNT is the size of the load group (the number of statements plus the
5612 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5613 only one statement, with a gap of COUNT - 1.
5614
5615 Returns true if a suitable permute exists. */
5616
5617 bool
5618 vect_grouped_load_supported (tree vectype, bool single_element_p,
5619 unsigned HOST_WIDE_INT count)
5620 {
5621 machine_mode mode = TYPE_MODE (vectype);
5622
5623 /* If this is single-element interleaving with an element distance
5624 that leaves unused vector loads around, punt - we would at least create
5625 very sub-optimal code in that case (and blow up memory,
5626 see PR65518). */
5627 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5628 {
5629 if (dump_enabled_p ())
5630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631 "single-element interleaving not supported "
5632 "for non-adjacent vector loads\n");
5633 return false;
5634 }
5635
5636 /* vect_permute_load_chain requires the group size to be equal to 3 or
5637 be a power of two. */
5638 if (count != 3 && exact_log2 (count) == -1)
5639 {
5640 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5642 "the size of the group of accesses"
5643 " is not a power of 2 or not equal to 3\n");
5644 return false;
5645 }
5646
5647 /* Check that the permutation is supported. */
5648 if (VECTOR_MODE_P (mode))
5649 {
5650 unsigned int i, j;
5651 if (count == 3)
5652 {
5653 unsigned int nelt;
5654 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5655 {
5656 if (dump_enabled_p ())
5657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5658 "cannot handle groups of 3 loads for"
5659 " variable-length vectors\n");
5660 return false;
5661 }
5662
5663 vec_perm_builder sel (nelt, nelt, 1);
5664 sel.quick_grow (nelt);
5665 vec_perm_indices indices;
5666 unsigned int k;
5667 for (k = 0; k < 3; k++)
5668 {
5669 for (i = 0; i < nelt; i++)
5670 if (3 * i + k < 2 * nelt)
5671 sel[i] = 3 * i + k;
5672 else
5673 sel[i] = 0;
5674 indices.new_vector (sel, 2, nelt);
5675 if (!can_vec_perm_const_p (mode, indices))
5676 {
5677 if (dump_enabled_p ())
5678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5679 "shuffle of 3 loads is not supported by"
5680 " target\n");
5681 return false;
5682 }
5683 for (i = 0, j = 0; i < nelt; i++)
5684 if (3 * i + k < 2 * nelt)
5685 sel[i] = i;
5686 else
5687 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5688 indices.new_vector (sel, 2, nelt);
5689 if (!can_vec_perm_const_p (mode, indices))
5690 {
5691 if (dump_enabled_p ())
5692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693 "shuffle of 3 loads is not supported by"
5694 " target\n");
5695 return false;
5696 }
5697 }
5698 return true;
5699 }
5700 else
5701 {
5702 /* If length is not equal to 3 then only power of 2 is supported. */
5703 gcc_assert (pow2p_hwi (count));
5704 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5705
5706 /* The encoding has a single stepped pattern. */
5707 vec_perm_builder sel (nelt, 1, 3);
5708 sel.quick_grow (3);
5709 for (i = 0; i < 3; i++)
5710 sel[i] = i * 2;
5711 vec_perm_indices indices (sel, 2, nelt);
5712 if (can_vec_perm_const_p (mode, indices))
5713 {
5714 for (i = 0; i < 3; i++)
5715 sel[i] = i * 2 + 1;
5716 indices.new_vector (sel, 2, nelt);
5717 if (can_vec_perm_const_p (mode, indices))
5718 return true;
5719 }
5720 }
5721 }
5722
5723 if (dump_enabled_p ())
5724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5725 "extract even/odd not supported by target\n");
5726 return false;
5727 }
5728
5729 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5730 type VECTYPE. MASKED_P says whether the masked form is needed. */
5731
5732 bool
5733 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5734 bool masked_p)
5735 {
5736 if (masked_p)
5737 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5738 vec_mask_load_lanes_optab,
5739 vectype, count);
5740 else
5741 return vect_lanes_optab_supported_p ("vec_load_lanes",
5742 vec_load_lanes_optab,
5743 vectype, count);
5744 }
5745
5746 /* Function vect_permute_load_chain.
5747
5748 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5749 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5750 the input data correctly. Return the final references for loads in
5751 RESULT_CHAIN.
5752
5753 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5754 The input is 4 vectors each containing 8 elements. We assign a number to each
5755 element, the input sequence is:
5756
5757 1st vec: 0 1 2 3 4 5 6 7
5758 2nd vec: 8 9 10 11 12 13 14 15
5759 3rd vec: 16 17 18 19 20 21 22 23
5760 4th vec: 24 25 26 27 28 29 30 31
5761
5762 The output sequence should be:
5763
5764 1st vec: 0 4 8 12 16 20 24 28
5765 2nd vec: 1 5 9 13 17 21 25 29
5766 3rd vec: 2 6 10 14 18 22 26 30
5767 4th vec: 3 7 11 15 19 23 27 31
5768
5769 i.e., the first output vector should contain the first elements of each
5770 interleaving group, etc.
5771
5772 We use extract_even/odd instructions to create such output. The input of
5773 each extract_even/odd operation is two vectors
5774 1st vec 2nd vec
5775 0 1 2 3 4 5 6 7
5776
5777 and the output is the vector of extracted even/odd elements. The output of
5778 extract_even will be: 0 2 4 6
5779 and of extract_odd: 1 3 5 7
5780
5781
5782 The permutation is done in log LENGTH stages. In each stage extract_even
5783 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5784 their order. In our example,
5785
5786 E1: extract_even (1st vec, 2nd vec)
5787 E2: extract_odd (1st vec, 2nd vec)
5788 E3: extract_even (3rd vec, 4th vec)
5789 E4: extract_odd (3rd vec, 4th vec)
5790
5791 The output for the first stage will be:
5792
5793 E1: 0 2 4 6 8 10 12 14
5794 E2: 1 3 5 7 9 11 13 15
5795 E3: 16 18 20 22 24 26 28 30
5796 E4: 17 19 21 23 25 27 29 31
5797
5798 In order to proceed and create the correct sequence for the next stage (or
5799 for the correct output, if the second stage is the last one, as in our
5800 example), we first put the output of extract_even operation and then the
5801 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5802 The input for the second stage is:
5803
5804 1st vec (E1): 0 2 4 6 8 10 12 14
5805 2nd vec (E3): 16 18 20 22 24 26 28 30
5806 3rd vec (E2): 1 3 5 7 9 11 13 15
5807 4th vec (E4): 17 19 21 23 25 27 29 31
5808
5809 The output of the second stage:
5810
5811 E1: 0 4 8 12 16 20 24 28
5812 E2: 2 6 10 14 18 22 26 30
5813 E3: 1 5 9 13 17 21 25 29
5814 E4: 3 7 11 15 19 23 27 31
5815
5816 And RESULT_CHAIN after reordering:
5817
5818 1st vec (E1): 0 4 8 12 16 20 24 28
5819 2nd vec (E3): 1 5 9 13 17 21 25 29
5820 3rd vec (E2): 2 6 10 14 18 22 26 30
5821 4th vec (E4): 3 7 11 15 19 23 27 31. */
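/* For reference, a minimal scalar sketch of the two permutations used below
   (illustration only):

     static void
     extract_even (const int *a, const int *b, int *out, int n)
     {
       // out = the even-indexed elements of the concatenation of A and B.
       for (int i = 0; i < n / 2; i++)
         {
           out[i] = a[2 * i];
           out[n / 2 + i] = b[2 * i];
         }
     }

     static void
     extract_odd (const int *a, const int *b, int *out, int n)
     {
       for (int i = 0; i < n / 2; i++)
         {
           out[i] = a[2 * i + 1];
           out[n / 2 + i] = b[2 * i + 1];
         }
     }

   matching the {0, 2, 4, ...} and {1, 3, 5, ...} masks generated below.  */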
5822
5823 static void
5824 vect_permute_load_chain (vec<tree> dr_chain,
5825 unsigned int length,
5826 stmt_vec_info stmt_info,
5827 gimple_stmt_iterator *gsi,
5828 vec<tree> *result_chain)
5829 {
5830 tree data_ref, first_vect, second_vect;
5831 tree perm_mask_even, perm_mask_odd;
5832 tree perm3_mask_low, perm3_mask_high;
5833 gimple *perm_stmt;
5834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5835 unsigned int i, j, log_length = exact_log2 (length);
5836
5837 result_chain->quick_grow (length);
5838 memcpy (result_chain->address (), dr_chain.address (),
5839 length * sizeof (tree));
5840
5841 if (length == 3)
5842 {
5843 /* vect_grouped_load_supported ensures that this is constant. */
5844 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5845 unsigned int k;
5846
5847 vec_perm_builder sel (nelt, nelt, 1);
5848 sel.quick_grow (nelt);
5849 vec_perm_indices indices;
5850 for (k = 0; k < 3; k++)
5851 {
5852 for (i = 0; i < nelt; i++)
5853 if (3 * i + k < 2 * nelt)
5854 sel[i] = 3 * i + k;
5855 else
5856 sel[i] = 0;
5857 indices.new_vector (sel, 2, nelt);
5858 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5859
5860 for (i = 0, j = 0; i < nelt; i++)
5861 if (3 * i + k < 2 * nelt)
5862 sel[i] = i;
5863 else
5864 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5865 indices.new_vector (sel, 2, nelt);
5866 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5867
5868 first_vect = dr_chain[0];
5869 second_vect = dr_chain[1];
5870
5871 /* Create interleaving stmt (low part of):
5872 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5873 ...}> */
5874 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5875 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5876 second_vect, perm3_mask_low);
5877 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5878
5879 /* Create interleaving stmt (high part of):
5880 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5881 ...}> */
5882 first_vect = data_ref;
5883 second_vect = dr_chain[2];
5884 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5885 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5886 second_vect, perm3_mask_high);
5887 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5888 (*result_chain)[k] = data_ref;
5889 }
5890 }
5891 else
5892 {
5893 /* If length is not equal to 3 then only power of 2 is supported. */
5894 gcc_assert (pow2p_hwi (length));
5895
5896 /* The encoding has a single stepped pattern. */
5897 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5898 vec_perm_builder sel (nelt, 1, 3);
5899 sel.quick_grow (3);
5900 for (i = 0; i < 3; ++i)
5901 sel[i] = i * 2;
5902 vec_perm_indices indices (sel, 2, nelt);
5903 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5904
5905 for (i = 0; i < 3; ++i)
5906 sel[i] = i * 2 + 1;
5907 indices.new_vector (sel, 2, nelt);
5908 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5909
5910 for (i = 0; i < log_length; i++)
5911 {
5912 for (j = 0; j < length; j += 2)
5913 {
5914 first_vect = dr_chain[j];
5915 second_vect = dr_chain[j+1];
5916
5917 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5918 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5919 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5920 first_vect, second_vect,
5921 perm_mask_even);
5922 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5923 (*result_chain)[j/2] = data_ref;
5924
5925 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5926 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5927 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5928 first_vect, second_vect,
5929 perm_mask_odd);
5930 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5931 (*result_chain)[j/2+length/2] = data_ref;
5932 }
5933 memcpy (dr_chain.address (), result_chain->address (),
5934 length * sizeof (tree));
5935 }
5936 }
5937 }
5938
5939 /* Function vect_shift_permute_load_chain.
5940
5941 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5942 sequence of stmts to reorder the input data accordingly.
5943 Return the final references for loads in RESULT_CHAIN.
5944 Return true if successful, false otherwise.
5945
5946 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5947 The input is 3 vectors each containing 8 elements. We assign a
5948 number to each element, the input sequence is:
5949
5950 1st vec: 0 1 2 3 4 5 6 7
5951 2nd vec: 8 9 10 11 12 13 14 15
5952 3rd vec: 16 17 18 19 20 21 22 23
5953
5954 The output sequence should be:
5955
5956 1st vec: 0 3 6 9 12 15 18 21
5957 2nd vec: 1 4 7 10 13 16 19 22
5958 3rd vec: 2 5 8 11 14 17 20 23
5959
5960 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5961
5962 First we shuffle all 3 vectors to get correct elements order:
5963
5964 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5965 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5966 3rd vec: (16 19 22) (17 20 23) (18 21)
5967
5968 Next we unite and shift vector 3 times:
5969
5970 1st step:
5971 shift right by 6 the concatenation of:
5972 "1st vec" and "2nd vec"
5973 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5974 "2nd vec" and "3rd vec"
5975 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5976 "3rd vec" and "1st vec"
5977 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5978 | New vectors |
5979
5980 So that now new vectors are:
5981
5982 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5983 2nd vec: (10 13) (16 19 22) (17 20 23)
5984 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5985
5986 2nd step:
5987 shift right by 5 the concatenation of:
5988 "1st vec" and "3rd vec"
5989 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5990 "2nd vec" and "1st vec"
5991 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5992 "3rd vec" and "2nd vec"
5993 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5994 | New vectors |
5995
5996 So that now new vectors are:
5997
5998 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5999 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6000 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6001
6002 3rd step:
6003 shift right by 5 the concatenation of:
6004 "1st vec" and "1st vec"
6005 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6006 shift right by 3 the concatenation of:
6007 "2nd vec" and "2nd vec"
6008 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6009 | New vectors |
6010
6011 So that now all vectors are READY:
6012 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6013 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6014 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6015
6016 This algorithm is faster than the one in vect_permute_load_chain if:
6017 1. "shift of a concatenation" is faster than general permutation.
6018 This is usually so.
6019 2. The TARGET machine can't execute vector instructions in parallel.
6020 This is because each step of the algorithm depends on the previous one.
6021 The algorithm in vect_permute_load_chain is much more parallel.
6022
6023 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6024 */
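/* The basic building block above, "shift right by S the concatenation of
   A and B", corresponds to a VEC_PERM_EXPR with the stepped mask
   {S, S+1, ..., S+nelt-1}; as a plain-C sketch (illustration only):

     static void
     shift_concat (const int *a, const int *b, int s, int *out, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = (s + i < n) ? a[s + i] : b[s + i - n];
     }
*/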
6025
6026 static bool
6027 vect_shift_permute_load_chain (vec<tree> dr_chain,
6028 unsigned int length,
6029 stmt_vec_info stmt_info,
6030 gimple_stmt_iterator *gsi,
6031 vec<tree> *result_chain)
6032 {
6033 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6034 tree perm2_mask1, perm2_mask2, perm3_mask;
6035 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6036 gimple *perm_stmt;
6037
6038 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6039 unsigned int i;
6040 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6041
6042 unsigned HOST_WIDE_INT nelt, vf;
6043 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6044 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6045 /* Not supported for variable-length vectors. */
6046 return false;
6047
6048 vec_perm_builder sel (nelt, nelt, 1);
6049 sel.quick_grow (nelt);
6050
6051 result_chain->quick_grow (length);
6052 memcpy (result_chain->address (), dr_chain.address (),
6053 length * sizeof (tree));
6054
6055 if (pow2p_hwi (length) && vf > 4)
6056 {
6057 unsigned int j, log_length = exact_log2 (length);
6058 for (i = 0; i < nelt / 2; ++i)
6059 sel[i] = i * 2;
6060 for (i = 0; i < nelt / 2; ++i)
6061 sel[nelt / 2 + i] = i * 2 + 1;
6062 vec_perm_indices indices (sel, 2, nelt);
6063 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6064 {
6065 if (dump_enabled_p ())
6066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6067 "shuffle of 2 fields structure is not"
6068 " supported by target\n");
6069 return false;
6070 }
6071 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6072
6073 for (i = 0; i < nelt / 2; ++i)
6074 sel[i] = i * 2 + 1;
6075 for (i = 0; i < nelt / 2; ++i)
6076 sel[nelt / 2 + i] = i * 2;
6077 indices.new_vector (sel, 2, nelt);
6078 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6079 {
6080 if (dump_enabled_p ())
6081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6082 "shuffle of 2 fields structure is not"
6083 " supported by target\n");
6084 return false;
6085 }
6086 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6087
6088 /* Generating permutation constant to shift all elements.
6089 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6090 for (i = 0; i < nelt; i++)
6091 sel[i] = nelt / 2 + i;
6092 indices.new_vector (sel, 2, nelt);
6093 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6094 {
6095 if (dump_enabled_p ())
6096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6097 "shift permutation is not supported by target\n");
6098 return false;
6099 }
6100 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6101
6102 /* Generating permutation constant to select vector from 2.
6103 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6104 for (i = 0; i < nelt / 2; i++)
6105 sel[i] = i;
6106 for (i = nelt / 2; i < nelt; i++)
6107 sel[i] = nelt + i;
6108 indices.new_vector (sel, 2, nelt);
6109 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6110 {
6111 if (dump_enabled_p ())
6112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6113 "select is not supported by target\n");
6114 return false;
6115 }
6116 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6117
6118 for (i = 0; i < log_length; i++)
6119 {
6120 for (j = 0; j < length; j += 2)
6121 {
6122 first_vect = dr_chain[j];
6123 second_vect = dr_chain[j + 1];
6124
6125 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6126 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6127 first_vect, first_vect,
6128 perm2_mask1);
6129 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6130 vect[0] = data_ref;
6131
6132 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6133 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6134 second_vect, second_vect,
6135 perm2_mask2);
6136 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6137 vect[1] = data_ref;
6138
6139 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6140 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6141 vect[0], vect[1], shift1_mask);
6142 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6143 (*result_chain)[j/2 + length/2] = data_ref;
6144
6145 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6146 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6147 vect[0], vect[1], select_mask);
6148 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6149 (*result_chain)[j/2] = data_ref;
6150 }
6151 memcpy (dr_chain.address (), result_chain->address (),
6152 length * sizeof (tree));
6153 }
6154 return true;
6155 }
6156 if (length == 3 && vf > 2)
6157 {
6158 unsigned int k = 0, l = 0;
6159
6160 /* Generating permutation constant to get all elements in the right order.
6161 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6162 for (i = 0; i < nelt; i++)
6163 {
6164 if (3 * k + (l % 3) >= nelt)
6165 {
6166 k = 0;
6167 l += (3 - (nelt % 3));
6168 }
6169 sel[i] = 3 * k + (l % 3);
6170 k++;
6171 }
6172 vec_perm_indices indices (sel, 2, nelt);
6173 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6174 {
6175 if (dump_enabled_p ())
6176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6177 "shuffle of 3 fields structure is not"
6178 " supported by target\n");
6179 return false;
6180 }
6181 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6182
6183 /* Generating permutation constant to shift all elements.
6184 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6185 for (i = 0; i < nelt; i++)
6186 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6187 indices.new_vector (sel, 2, nelt);
6188 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6189 {
6190 if (dump_enabled_p ())
6191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6192 "shift permutation is not supported by target\n");
6193 return false;
6194 }
6195 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6196
6197 /* Generating permutation constant to shift all elements.
6198 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6199 for (i = 0; i < nelt; i++)
6200 sel[i] = 2 * (nelt / 3) + 1 + i;
6201 indices.new_vector (sel, 2, nelt);
6202 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6203 {
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6206 "shift permutation is not supported by target\n");
6207 return false;
6208 }
6209 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6210
6211 /* Generating permutation constant to shift all elements.
6212 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6213 for (i = 0; i < nelt; i++)
6214 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6215 indices.new_vector (sel, 2, nelt);
6216 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6217 {
6218 if (dump_enabled_p ())
6219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6220 "shift permutation is not supported by target\n");
6221 return false;
6222 }
6223 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6224
6225 /* Generating permutation constant to shift all elements.
6226 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6227 for (i = 0; i < nelt; i++)
6228 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6229 indices.new_vector (sel, 2, nelt);
6230 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6231 {
6232 if (dump_enabled_p ())
6233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6234 "shift permutation is not supported by target\n");
6235 return false;
6236 }
6237 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6238
6239 for (k = 0; k < 3; k++)
6240 {
6241 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6242 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6243 dr_chain[k], dr_chain[k],
6244 perm3_mask);
6245 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6246 vect[k] = data_ref;
6247 }
6248
6249 for (k = 0; k < 3; k++)
6250 {
6251 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6252 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6253 vect[k % 3], vect[(k + 1) % 3],
6254 shift1_mask);
6255 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6256 vect_shift[k] = data_ref;
6257 }
6258
6259 for (k = 0; k < 3; k++)
6260 {
6261 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6262 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6263 vect_shift[(4 - k) % 3],
6264 vect_shift[(3 - k) % 3],
6265 shift2_mask);
6266 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6267 vect[k] = data_ref;
6268 }
6269
6270 (*result_chain)[3 - (nelt % 3)] = vect[2];
6271
6272 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6273 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6274 vect[0], shift3_mask);
6275 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6276 (*result_chain)[nelt % 3] = data_ref;
6277
6278 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6279 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6280 vect[1], shift4_mask);
6281 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6282 (*result_chain)[0] = data_ref;
6283 return true;
6284 }
6285 return false;
6286 }
6287
6288 /* Function vect_transform_grouped_load.
6289
6290 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6291 to perform their permutation and assign the resulting vectorized statements to
6292 the scalar statements.
6293 */
6294
6295 void
6296 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6297 int size, gimple_stmt_iterator *gsi)
6298 {
6299 machine_mode mode;
6300 vec<tree> result_chain = vNULL;
6301
6302 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6303 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6304 vectors that are ready for vector computation. */
6305 result_chain.create (size);
6306
6307 /* If the reassociation width for the vector type is 2 or greater, the target
6308 machine can execute 2 or more vector instructions in parallel. Otherwise try
6309 to get the chain for the load group using vect_shift_permute_load_chain. */
6310 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6311 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6312 || pow2p_hwi (size)
6313 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6314 gsi, &result_chain))
6315 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6316 vect_record_grouped_load_vectors (stmt_info, result_chain);
6317 result_chain.release ();
6318 }
6319
6320 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6321 generated as part of the vectorization of STMT_INFO. Assign the statement
6322 for each vector to the associated scalar statement. */
6323
6324 void
6325 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6326 vec<tree> result_chain)
6327 {
6328 vec_info *vinfo = stmt_info->vinfo;
6329 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6330 unsigned int i, gap_count;
6331 tree tmp_data_ref;
6332
6333 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6334 Since we scan the chain starting from its first node, their order
6335 corresponds to the order of data-refs in RESULT_CHAIN. */
6336 stmt_vec_info next_stmt_info = first_stmt_info;
6337 gap_count = 1;
6338 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6339 {
6340 if (!next_stmt_info)
6341 break;
6342
6343 /* Skip the gaps. Loads created for the gaps will be removed by dead
6344 code elimination pass later. No need to check for the first stmt in
6345 the group, since it always exists.
6346 DR_GROUP_GAP is the number of steps in elements from the previous
6347 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6348 correspond to the gaps. */
6349 if (next_stmt_info != first_stmt_info
6350 && gap_count < DR_GROUP_GAP (next_stmt_info))
6351 {
6352 gap_count++;
6353 continue;
6354 }
6355
6356 /* ??? The following needs cleanup after the removal of
6357 DR_GROUP_SAME_DR_STMT. */
6358 if (next_stmt_info)
6359 {
6360 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6361 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6362 copies, and we put the new vector statement in the first available
6363 RELATED_STMT. */
6364 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6365 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6366 else
6367 {
6368 stmt_vec_info prev_stmt_info
6369 = STMT_VINFO_VEC_STMT (next_stmt_info);
6370 stmt_vec_info rel_stmt_info
6371 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6372 while (rel_stmt_info)
6373 {
6374 prev_stmt_info = rel_stmt_info;
6375 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6376 }
6377
6378 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6379 }
6380
6381 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6382 gap_count = 1;
6383 }
6384 }
6385 }
6386
6387 /* Function vect_can_force_dr_alignment_p.
6388
6389 Returns whether the alignment of a DECL can be forced to be aligned
6390 on ALIGNMENT bit boundary. */
6391
6392 bool
6393 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6394 {
6395 if (!VAR_P (decl))
6396 return false;
6397
6398 if (decl_in_symtab_p (decl)
6399 && !symtab_node::get (decl)->can_increase_alignment_p ())
6400 return false;
6401
6402 if (TREE_STATIC (decl))
6403 return (known_le (alignment,
6404 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6405 else
6406 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6407 }
6408
6409
6410 /* Return whether the data reference DR_INFO is supported with respect to its
6411 alignment.
6412 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6413 if it is aligned, i.e., check if it is possible to vectorize it with different
6414 alignment. */
6415
6416 enum dr_alignment_support
6417 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6418 bool check_aligned_accesses)
6419 {
6420 data_reference *dr = dr_info->dr;
6421 stmt_vec_info stmt_info = dr_info->stmt;
6422 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6423 machine_mode mode = TYPE_MODE (vectype);
6424 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6425 struct loop *vect_loop = NULL;
6426 bool nested_in_vect_loop = false;
6427
6428 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6429 return dr_aligned;
6430
6431 /* For now assume all conditional loads/stores support unaligned
6432 access without any special code. */
6433 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6434 if (gimple_call_internal_p (stmt)
6435 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6436 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6437 return dr_unaligned_supported;
6438
6439 if (loop_vinfo)
6440 {
6441 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6442 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6443 }
6444
6445 /* Possibly unaligned access. */
6446
6447 /* We can choose between using the implicit realignment scheme (generating
6448 a misaligned_move stmt) and the explicit realignment scheme (generating
6449 aligned loads with a REALIGN_LOAD). There are two variants to the
6450 explicit realignment scheme: optimized, and unoptimized.
6451 We can optimize the realignment only if the step between consecutive
6452 vector loads is equal to the vector size. Since the vector memory
6453 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6454 is guaranteed that the misalignment amount remains the same throughout the
6455 execution of the vectorized loop. Therefore, we can create the
6456 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6457 at the loop preheader.
6458
6459 However, in the case of outer-loop vectorization, when vectorizing a
6460 memory access in the inner-loop nested within the LOOP that is now being
6461 vectorized, while it is guaranteed that the misalignment of the
6462 vectorized memory access will remain the same in different outer-loop
6463 iterations, it is *not* guaranteed that is will remain the same throughout
6464 iterations, it is *not* guaranteed that it will remain the same throughout
6465 with the original scalar step (and not in steps of VS). If the inner-loop
6466 step happens to be a multiple of VS, then the misalignment remains fixed
6467 and we can use the optimized realignment scheme. For example:
6468
6469 for (i=0; i<N; i++)
6470 for (j=0; j<M; j++)
6471 s += a[i+j];
6472
6473 When vectorizing the i-loop in the above example, the step between
6474 consecutive vector loads is 1, and so the misalignment does not remain
6475 fixed across the execution of the inner-loop, and the realignment cannot
6476 be optimized (as illustrated in the following pseudo vectorized loop):
6477
6478 for (i=0; i<N; i+=4)
6479 for (j=0; j<M; j++){
6480 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6481 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6482 // (assuming that we start from an aligned address).
6483 }
6484
6485 We therefore have to use the unoptimized realignment scheme:
6486
6487 for (i=0; i<N; i+=4)
6488 for (j=k; j<M; j+=4)
6489 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6490 // that the misalignment of the initial address is
6491 // 0).
6492
6493 The loop can then be vectorized as follows:
6494
6495 for (k=0; k<4; k++){
6496 rt = get_realignment_token (&vp[k]);
6497 for (i=0; i<N; i+=4){
6498 v1 = vp[i+k];
6499 for (j=k; j<M; j+=4){
6500 v2 = vp[i+j+VS-1];
6501 va = REALIGN_LOAD <v1,v2,rt>;
6502 vs += va;
6503 v1 = v2;
6504 }
6505 }
6506 } */
6507
6508 if (DR_IS_READ (dr))
6509 {
6510 bool is_packed = false;
6511 tree type = (TREE_TYPE (DR_REF (dr)));
6512
6513 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6514 && (!targetm.vectorize.builtin_mask_for_load
6515 || targetm.vectorize.builtin_mask_for_load ()))
6516 {
6517 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6518
6519 /* If we are doing SLP then the accesses need not have the
6520 same alignment, instead it depends on the SLP group size. */
6521 if (loop_vinfo
6522 && STMT_SLP_TYPE (stmt_info)
6523 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6524 * (DR_GROUP_SIZE
6525 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6526 TYPE_VECTOR_SUBPARTS (vectype)))
6527 ;
6528 else if (!loop_vinfo
6529 || (nested_in_vect_loop
6530 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6531 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6532 return dr_explicit_realign;
6533 else
6534 return dr_explicit_realign_optimized;
6535 }
6536 if (!known_alignment_for_access_p (dr_info))
6537 is_packed = not_size_aligned (DR_REF (dr));
6538
6539 if (targetm.vectorize.support_vector_misalignment
6540 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6541 /* Can't software pipeline the loads, but can at least do them. */
6542 return dr_unaligned_supported;
6543 }
6544 else
6545 {
6546 bool is_packed = false;
6547 tree type = (TREE_TYPE (DR_REF (dr)));
6548
6549 if (!known_alignment_for_access_p (dr_info))
6550 is_packed = not_size_aligned (DR_REF (dr));
6551
6552 if (targetm.vectorize.support_vector_misalignment
6553 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6554 return dr_unaligned_supported;
6555 }
6556
6557 /* Unsupported. */
6558 return dr_unaligned_unsupported;
6559 }