tree-vect-data-refs.c (vect_grouped_store_supported): Fix typo "permutaion".
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
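/* For example, querying "vec_load_lanes" with COUNT == 3 and a 16-byte
   vector type asks the target for an array mode that can hold three such
   vectors (48 bytes, or an integer mode of that size as a fallback) and
   then checks that the convert optab has a handler for that
   (array mode, vector mode) pair.  */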
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
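/* For example, given
     short_x = ...;            (loaded in the loop)
     int_y = (int) short_x;
   the smallest scalar type is 'short', so the vectorization factor is
   chosen from the number of shorts per vector rather than the number of
   ints.  */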
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 {
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
147 }
148
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
152 }
153
154
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
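/* For instance, for two references based on pointers p and q the recorded
   DDR later leads to a versioning test of roughly the form
     if (p + segment_len <= q || q + segment_len <= p)
       ... vectorized loop ...
     else
       ... scalar fallback ...
   where segment_len covers the bytes accessed by the vector loop.  */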
158
159 static bool
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 {
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return false;
166
167 if (!runtime_alias_check_p (ddr, loop,
168 optimize_loop_nest_for_speed_p (loop)))
169 return false;
170
171 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
172 return true;
173 }
174
175 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176
177 static void
178 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 {
180 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
181 for (unsigned int i = 0; i < checks.length(); ++i)
182 if (checks[i] == value)
183 return;
184
185 if (dump_enabled_p ())
186 {
187 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
188 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
189 dump_printf (MSG_NOTE, " is nonzero\n");
190 }
191 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
192 }
193
194 /* Return true if we know that the order of vectorized DR_INFO_A and
195 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
196 DR_INFO_B. At least one of the accesses is a write. */
197
198 static bool
199 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
200 {
201 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
202 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
203
204 /* Single statements are always kept in their original order. */
205 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
206 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
207 return true;
208
209 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
210 group are emitted at the position of the first scalar load and all
211 stores in a group are emitted at the position of the last scalar store.
212 Thus writes will happen no earlier than their current position
213 (but could happen later) while reads will happen no later than their
214 current position (but could happen earlier). Reordering is therefore
215 only possible if the first access is a write. */
216 stmtinfo_a = vect_orig_stmt (stmtinfo_a);
217 stmtinfo_b = vect_orig_stmt (stmtinfo_b);
218 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
219 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
220 }
221
222 /* A subroutine of vect_analyze_data_ref_dependence. Handle
223 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
224 distances. These distances are conservatively correct but they don't
225 reflect a guaranteed dependence.
226
227 Return true if this function does all the work necessary to avoid
228 an alias or false if the caller should use the dependence distances
229 to limit the vectorization factor in the usual way. LOOP_DEPTH is
230 the depth of the loop described by LOOP_VINFO and the other arguments
231 are as for vect_analyze_data_ref_dependence. */
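/* For example, if the only known dependence distance is 4, limiting the
   VF to 4 would be conservatively safe, but when the two bases cannot in
   fact overlap a run-time alias check lets us keep a larger VF, so the
   check is preferred whenever it is available.  */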
232
233 static bool
234 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
235 loop_vec_info loop_vinfo,
236 int loop_depth, unsigned int *max_vf)
237 {
238 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
239 lambda_vector dist_v;
240 unsigned int i;
241 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
242 {
243 int dist = dist_v[loop_depth];
244 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
245 {
246 /* If the user asserted safelen >= DIST consecutive iterations
247 can be executed concurrently, assume independence.
248
249 ??? An alternative would be to add the alias check even
250 in this case, and vectorize the fallback loop with the
251 maximum VF set to safelen. However, if the user has
252 explicitly given a length, it's less likely that that
253 would be a win. */
254 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
255 {
256 if ((unsigned int) loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 continue;
260 }
261
262 /* For dependence distances of 2 or more, we have the option
263 of limiting VF or checking for an alias at runtime.
264 Prefer to check at runtime if we can, to avoid limiting
265 the VF unnecessarily when the bases are in fact independent.
266
267 Note that the alias checks will be removed if the VF ends up
268 being small enough. */
269 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
270 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
271 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
272 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
273 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
274 }
275 }
276 return true;
277 }
278
279
280 /* Function vect_analyze_data_ref_dependence.
281
282 Return TRUE if there (might) exist a dependence between a memory-reference
283 DRA and a memory-reference DRB.  If the dependence can instead be
284 checked at run-time by versioning for alias, return FALSE.  Adjust *MAX_VF according to
285 the data dependence. */
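/* For example, the loop
     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;
   has a dependence distance of 3, so at most 3 consecutive iterations can
   be executed as one vector iteration and *MAX_VF is capped at 3.  */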
286
287 static bool
288 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
289 loop_vec_info loop_vinfo,
290 unsigned int *max_vf)
291 {
292 unsigned int i;
293 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
294 struct data_reference *dra = DDR_A (ddr);
295 struct data_reference *drb = DDR_B (ddr);
296 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
297 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
298 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
299 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
300 lambda_vector dist_v;
301 unsigned int loop_depth;
302
303 /* In loop analysis all data references should be vectorizable. */
304 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
305 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
306 gcc_unreachable ();
307
308 /* Independent data accesses. */
309 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
310 return false;
311
312 if (dra == drb
313 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
314 return false;
315
316 /* We do not have to consider dependences between accesses that belong
317 to the same group, unless the stride could be smaller than the
318 group size. */
319 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
321 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
322 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
323 return false;
324
325 /* Even if we have an anti-dependence then, as the vectorized loop covers at
326 least two scalar iterations, there is always also a true dependence.
327 As the vectorizer does not re-order loads and stores we can ignore
328 the anti-dependence if TBAA can disambiguate both DRs similar to the
329 case with known negative distance anti-dependences (positive
330 distance anti-dependences would violate TBAA constraints). */
331 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
332 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
333 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
334 get_alias_set (DR_REF (drb))))
335 return false;
336
337 /* Unknown data dependence. */
338 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 {
340 /* If user asserted safelen consecutive iterations can be
341 executed concurrently, assume independence. */
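/* E.g. '#pragma omp simd safelen(8)' sets loop->safelen to 8, asserting
   that up to 8 consecutive iterations may run concurrently, so a VF of
   up to 8 is usable even though independence cannot be proved here.  */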
342 if (loop->safelen >= 2)
343 {
344 if ((unsigned int) loop->safelen < *max_vf)
345 *max_vf = loop->safelen;
346 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
347 return false;
348 }
349
350 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
351 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 {
353 if (dump_enabled_p ())
354 {
355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
356 "versioning for alias not supported for: "
357 "can't determine dependence between ");
358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
359 DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (drb));
363 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 }
365 return true;
366 }
367
368 if (dump_enabled_p ())
369 {
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "versioning for alias required: "
372 "can't determine dependence between ");
373 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
374 DR_REF (dra));
375 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
376 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
377 DR_REF (drb));
378 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
379 }
380
381 /* Add to list of ddrs that need to be tested at run-time. */
382 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 }
384
385 /* Known data dependence. */
386 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 {
388 /* If user asserted safelen consecutive iterations can be
389 executed concurrently, assume independence. */
390 if (loop->safelen >= 2)
391 {
392 if ((unsigned int) loop->safelen < *max_vf)
393 *max_vf = loop->safelen;
394 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
395 return false;
396 }
397
398 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
399 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 {
401 if (dump_enabled_p ())
402 {
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "versioning for alias not supported for: "
405 "bad dist vector for ");
406 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
407 DR_REF (dra));
408 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
410 DR_REF (drb));
411 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 }
413 return true;
414 }
415
416 if (dump_enabled_p ())
417 {
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "versioning for alias required: "
420 "bad dist vector for ");
421 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
422 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
423 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
424 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 }
426 /* Add to list of ddrs that need to be tested at run-time. */
427 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
428 }
429
430 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431
432 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
433 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
434 loop_depth, max_vf))
435 return false;
436
437 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 {
439 int dist = dist_v[loop_depth];
440
441 if (dump_enabled_p ())
442 dump_printf_loc (MSG_NOTE, vect_location,
443 "dependence distance = %d.\n", dist);
444
445 if (dist == 0)
446 {
447 if (dump_enabled_p ())
448 {
449 dump_printf_loc (MSG_NOTE, vect_location,
450 "dependence distance == 0 between ");
451 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
452 dump_printf (MSG_NOTE, " and ");
453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
454 dump_printf (MSG_NOTE, "\n");
455 }
456
457 /* When we perform grouped accesses and perform implicit CSE
458 by detecting equal accesses and doing disambiguation with
459 runtime alias tests like for
460 .. = a[i];
461 .. = a[i+1];
462 a[i] = ..;
463 a[i+1] = ..;
464 *p = ..;
465 .. = a[i];
466 .. = a[i+1];
467 where we will end up loading { a[i], a[i+1] } once, make
468 sure that inserting group loads before the first load and
469 stores after the last store will do the right thing.
470 Similar for groups like
471 a[i] = ...;
472 ... = a[i];
473 a[i+1] = ...;
474 where loads from the group interleave with the store. */
475 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
476 {
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
481 }
482
483 if (loop->safelen < 2)
484 {
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
492 }
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
495 }
496 continue;
497 }
498
499 if (dist > 0 && DDR_REVERSED_P (ddr))
500 {
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
515 }
516
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
519 {
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
527 }
528
529 if (abs_dist >= *max_vf)
530 {
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
537 }
538
539 if (dump_enabled_p ())
540 {
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
546 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
548 }
549
550 return true;
551 }
552
553 return false;
554 }
555
556 /* Function vect_analyze_data_ref_dependences.
557
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
561
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
565 {
566 unsigned int i;
567 struct data_dependence_relation *ddr;
568
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
570
571 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
572 {
573 LOOP_VINFO_DDRS (loop_vinfo)
574 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
575 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
576 /* We need read-read dependences to compute
577 STMT_VINFO_SAME_ALIGN_REFS. */
578 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
579 &LOOP_VINFO_DDRS (loop_vinfo),
580 LOOP_VINFO_LOOP_NEST (loop_vinfo),
581 true);
582 gcc_assert (res);
583 }
584
585 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
586
587 /* For epilogues we either have no aliases or alias versioning
588 was applied to original loop. Therefore we may just get max_vf
589 using VF of original loop. */
590 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
591 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
592 else
593 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
594 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
595 return false;
596
597 return true;
598 }
599
600
601 /* Function vect_slp_analyze_data_ref_dependence.
602
603 Return TRUE if there (might) exist a dependence between a memory-reference
604 DRA and a memory-reference DRB for VINFO.  Return FALSE if the accesses
605 are known to be independent.  Unlike the loop analysis, no run-time
606 alias versioning is attempted here.  */
607
608 static bool
609 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
610 struct data_dependence_relation *ddr)
611 {
612 struct data_reference *dra = DDR_A (ddr);
613 struct data_reference *drb = DDR_B (ddr);
614 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
615 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
616
617 /* We need to check dependences of statements marked as unvectorizable
618 as well, they still can prohibit vectorization. */
619
620 /* Independent data accesses. */
621 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
622 return false;
623
624 if (dra == drb)
625 return false;
626
627 /* Read-read is OK. */
628 if (DR_IS_READ (dra) && DR_IS_READ (drb))
629 return false;
630
631 /* If dra and drb are part of the same interleaving chain consider
632 them independent. */
633 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
634 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
635 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
636 return false;
637
638 /* Unknown data dependence. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
640 {
641 if (dump_enabled_p ())
642 {
643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
644 "can't determine dependence between ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
646 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
647 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 }
650 }
651 else if (dump_enabled_p ())
652 {
653 dump_printf_loc (MSG_NOTE, vect_location,
654 "determined dependence between ");
655 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
656 dump_printf (MSG_NOTE, " and ");
657 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
658 dump_printf (MSG_NOTE, "\n");
659 }
660
661 return true;
662 }
663
664
665 /* Analyze dependences involved in the transform of SLP NODE. STORES
666 contains the vector of scalar stores of this instance if we are
667 disambiguating the loads. */
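/* For example, for an SLP store group { a[0] = x; a[1] = y; } every
   statement with a virtual operand between each scalar store and the last
   store of the group is checked, because the vectorized store is emitted
   at the position of the last scalar store and must not be moved across a
   conflicting access.  */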
668
669 static bool
670 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
671 vec<stmt_vec_info> stores,
672 stmt_vec_info last_store_info)
673 {
674 /* This walks over all stmts involved in the SLP load/store done
675 in NODE verifying we can sink them up to the last stmt in the
676 group. */
677 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
678 vec_info *vinfo = last_access_info->vinfo;
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 {
681 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
682 if (access_info == last_access_info)
683 continue;
684 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
685 ao_ref ref;
686 bool ref_initialized_p = false;
687 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
688 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
689 {
690 gimple *stmt = gsi_stmt (gsi);
691 if (! gimple_vuse (stmt)
692 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
693 continue;
694
695 /* If we couldn't record a (single) data reference for this
696 stmt we have to resort to the alias oracle. */
697 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
698 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
699 if (!dr_b)
700 {
701 /* We are moving a store or sinking a load - this means
702 we cannot use TBAA for disambiguation. */
703 if (!ref_initialized_p)
704 ao_ref_init (&ref, DR_REF (dr_a));
705 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
706 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
707 return false;
708 continue;
709 }
710
711 bool dependent = false;
712 /* If we run into a store of this same instance (we've just
713 marked those) then delay dependence checking until we run
714 into the last store because this is where it will have
715 been sunk to (and we verify if we can do that as well). */
716 if (gimple_visited_p (stmt))
717 {
718 if (stmt_info != last_store_info)
719 continue;
720 unsigned i;
721 stmt_vec_info store_info;
722 FOR_EACH_VEC_ELT (stores, i, store_info)
723 {
724 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
725 ddr_p ddr = initialize_data_dependence_relation
726 (dr_a, store_dr, vNULL);
727 dependent
728 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
729 free_dependence_relation (ddr);
730 if (dependent)
731 break;
732 }
733 }
734 else
735 {
736 ddr_p ddr = initialize_data_dependence_relation (dr_a,
737 dr_b, vNULL);
738 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
739 free_dependence_relation (ddr);
740 }
741 if (dependent)
742 return false;
743 }
744 }
745 return true;
746 }
747
748
749 /* Function vect_slp_analyze_instance_dependence.
750 
751 Examine all the data references in the SLP instance, and make sure there
752 do not exist any data dependences between them that would prevent the
753 instance from being vectorized.  */
754
755 bool
756 vect_slp_analyze_instance_dependence (slp_instance instance)
757 {
758 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
759
760 /* The stores of this instance are at the root of the SLP tree. */
761 slp_tree store = SLP_INSTANCE_TREE (instance);
762 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
763 store = NULL;
764
765 /* Verify we can sink stores to the vectorized stmt insert location. */
766 stmt_vec_info last_store_info = NULL;
767 if (store)
768 {
769 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
770 return false;
771
772 /* Mark stores in this instance and remember the last one. */
773 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
774 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
775 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
776 }
777
778 bool res = true;
779
780 /* Verify we can sink loads to the vectorized stmt insert location,
781 special-casing stores of this instance. */
782 slp_tree load;
783 unsigned int i;
784 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
785 if (! vect_slp_analyze_node_dependences (instance, load,
786 store
787 ? SLP_TREE_SCALAR_STMTS (store)
788 : vNULL, last_store_info))
789 {
790 res = false;
791 break;
792 }
793
794 /* Unset the visited flag. */
795 if (store)
796 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
797 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
798
799 return res;
800 }
801
802 /* Record the base alignment guarantee given by DRB, which occurs
803 in STMT_INFO. */
804
805 static void
806 vect_record_base_alignment (stmt_vec_info stmt_info,
807 innermost_loop_behavior *drb)
808 {
809 vec_info *vinfo = stmt_info->vinfo;
810 bool existed;
811 innermost_loop_behavior *&entry
812 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
813 if (!existed || entry->base_alignment < drb->base_alignment)
814 {
815 entry = drb;
816 if (dump_enabled_p ())
817 {
818 dump_printf_loc (MSG_NOTE, vect_location,
819 "recording new base alignment for ");
820 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
821 dump_printf (MSG_NOTE, "\n");
822 dump_printf_loc (MSG_NOTE, vect_location,
823 " alignment: %d\n", drb->base_alignment);
824 dump_printf_loc (MSG_NOTE, vect_location,
825 " misalignment: %d\n", drb->base_misalignment);
826 dump_printf_loc (MSG_NOTE, vect_location,
827 " based on: ");
828 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
829 }
830 }
831 }
832
833 /* If the region we're going to vectorize is reached, all unconditional
834 data references occur at least once. We can therefore pool the base
835 alignment guarantees from each unconditional reference. Do this by
836 going through all the data references in VINFO and checking whether
837 the containing statement makes the reference unconditionally. If so,
838 record the alignment of the base address in VINFO so that it can be
839 used for all other references with the same base. */
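/* For example, if the region unconditionally executes '*p = ...' and that
   reference guarantees a 32-byte aligned base, other references sharing
   the same base address expression can reuse the 32-byte guarantee even
   when their own statements only imply a weaker one.  */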
840
841 void
842 vect_record_base_alignments (vec_info *vinfo)
843 {
844 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
845 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
846 data_reference *dr;
847 unsigned int i;
848 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
849 {
850 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
851 stmt_vec_info stmt_info = dr_info->stmt;
852 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
853 && STMT_VINFO_VECTORIZABLE (stmt_info)
854 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
855 {
856 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
857
858 /* If DR is nested in the loop that is being vectorized, we can also
859 record the alignment of the base wrt the outer loop. */
860 if (loop && nested_in_vect_loop_p (loop, stmt_info))
861 vect_record_base_alignment
862 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
863 }
864 }
865 }
866
867 /* Return the target alignment for the vectorized form of DR_INFO. */
868
869 static unsigned int
870 vect_calculate_target_alignment (dr_vec_info *dr_info)
871 {
872 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
873 return targetm.vectorize.preferred_vector_alignment (vectype);
874 }
875
876 /* Function vect_compute_data_ref_alignment
877
878 Compute the misalignment of the data reference DR_INFO.
879
880 Output:
881 1. DR_MISALIGNMENT (DR_INFO) is defined.
882
883 FOR NOW: No analysis is actually performed. Misalignment is calculated
884 only for trivial cases. TODO. */
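/* As a concrete example, a reference whose pooled base misalignment is 4,
   whose constant init offset is 12 and whose target alignment is 16 bytes
   gets DR_MISALIGNMENT == (4 + 12) % 16 == 0, i.e. it is recorded as
   aligned.  */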
885
886 static void
887 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
888 {
889 stmt_vec_info stmt_info = dr_info->stmt;
890 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
891 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
892 struct loop *loop = NULL;
893 tree ref = DR_REF (dr_info->dr);
894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
895
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE, vect_location,
898 "vect_compute_data_ref_alignment:\n");
899
900 if (loop_vinfo)
901 loop = LOOP_VINFO_LOOP (loop_vinfo);
902
903 /* Initialize misalignment to unknown. */
904 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
905
906 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
907 return;
908
909 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
910 bool step_preserves_misalignment_p;
911
912 unsigned HOST_WIDE_INT vector_alignment
913 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
914 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
915
916 /* No step for BB vectorization. */
917 if (!loop)
918 {
919 gcc_assert (integer_zerop (drb->step));
920 step_preserves_misalignment_p = true;
921 }
922
923 /* In case the dataref is in an inner-loop of the loop that is being
924 vectorized (LOOP), we use the base and misalignment information
925 relative to the outer-loop (LOOP). This is ok only if the misalignment
926 stays the same throughout the execution of the inner-loop, which is why
927 we have to check that the stride of the dataref in the inner-loop is a
928 multiple of the vector alignment. */
929 else if (nested_in_vect_loop_p (loop, stmt_info))
930 {
931 step_preserves_misalignment_p
932 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
933
934 if (dump_enabled_p ())
935 {
936 if (step_preserves_misalignment_p)
937 dump_printf_loc (MSG_NOTE, vect_location,
938 "inner step divides the vector alignment.\n");
939 else
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "inner step doesn't divide the vector"
942 " alignment.\n");
943 }
944 }
945
946 /* Similarly we can only use base and misalignment information relative to
947 an innermost loop if the misalignment stays the same throughout the
948 execution of the loop. As above, this is the case if the stride of the
949 dataref times the vectorization factor is a multiple of the alignment. */
950 else
951 {
952 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
953 step_preserves_misalignment_p
954 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
955
956 if (!step_preserves_misalignment_p && dump_enabled_p ())
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "step doesn't divide the vector alignment.\n");
959 }
960
961 unsigned int base_alignment = drb->base_alignment;
962 unsigned int base_misalignment = drb->base_misalignment;
963
964 /* Calculate the maximum of the pooled base address alignment and the
965 alignment that we can compute for DR itself. */
966 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
967 if (entry && base_alignment < (*entry)->base_alignment)
968 {
969 base_alignment = (*entry)->base_alignment;
970 base_misalignment = (*entry)->base_misalignment;
971 }
972
973 if (drb->offset_alignment < vector_alignment
974 || !step_preserves_misalignment_p
975 /* We need to know whether the step wrt the vectorized loop is
976 negative when computing the starting misalignment below. */
977 || TREE_CODE (drb->step) != INTEGER_CST)
978 {
979 if (dump_enabled_p ())
980 {
981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
982 "Unknown alignment for access: ");
983 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
984 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
985 }
986 return;
987 }
988
989 if (base_alignment < vector_alignment)
990 {
991 unsigned int max_alignment;
992 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
993 if (max_alignment < vector_alignment
994 || !vect_can_force_dr_alignment_p (base,
995 vector_alignment * BITS_PER_UNIT))
996 {
997 if (dump_enabled_p ())
998 {
999 dump_printf_loc (MSG_NOTE, vect_location,
1000 "can't force alignment of ref: ");
1001 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1002 dump_printf (MSG_NOTE, "\n");
1003 }
1004 return;
1005 }
1006
1007 /* Force the alignment of the decl.
1008 NOTE: This is the only change to the code we make during
1009 the analysis phase, before deciding to vectorize the loop. */
1010 if (dump_enabled_p ())
1011 {
1012 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1013 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1014 dump_printf (MSG_NOTE, "\n");
1015 }
1016
1017 dr_info->base_decl = base;
1018 dr_info->base_misaligned = true;
1019 base_misalignment = 0;
1020 }
1021 poly_int64 misalignment
1022 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1023
1024 /* If this is a backward running DR then first access in the larger
1025 vectype actually is N-1 elements before the address in the DR.
1026 Adjust misalign accordingly. */
1027 if (tree_int_cst_sgn (drb->step) < 0)
1028 /* PLUS because STEP is negative. */
1029 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1030 * TREE_INT_CST_LOW (drb->step));
1031
1032 unsigned int const_misalignment;
1033 if (!known_misalignment (misalignment, vector_alignment,
1034 &const_misalignment))
1035 {
1036 if (dump_enabled_p ())
1037 {
1038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039 "Non-constant misalignment for access: ");
1040 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1041 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1042 }
1043 return;
1044 }
1045
1046 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1047
1048 if (dump_enabled_p ())
1049 {
1050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1051 "misalign = %d bytes of ref ",
1052 DR_MISALIGNMENT (dr_info));
1053 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1054 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1055 }
1056
1057 return;
1058 }
1059
1060 /* Function vect_update_misalignment_for_peel.
1061 Sets DR_INFO's misalignment
1062 - to 0 if it has the same alignment as DR_PEEL_INFO,
1063 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1064 - to -1 (unknown) otherwise.
1065
1066 DR_INFO - the data reference whose misalignment is to be adjusted.
1067 DR_PEEL_INFO - the data reference whose misalignment is being made
1068 zero in the vector loop by the peel.
1069 NPEEL - the number of iterations in the peel loop if the misalignment
1070 of DR_PEEL_INFO is known at compile time. */
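/* For example, peeling NPEEL == 2 iterations for a reference with element
   size 4, known misalignment 8 and target alignment 16 gives
   (8 + 2 * 4) & (16 - 1) == 0, so the reference becomes aligned.  */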
1071
1072 static void
1073 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1074 dr_vec_info *dr_peel_info, int npeel)
1075 {
1076 unsigned int i;
1077 vec<dr_p> same_aligned_drs;
1078 struct data_reference *current_dr;
1079 int dr_size = vect_get_scalar_dr_size (dr_info);
1080 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1081 stmt_vec_info stmt_info = dr_info->stmt;
1082 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1083
1084 /* For interleaved data accesses the step in the loop must be multiplied by
1085 the size of the interleaving group. */
1086 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1087 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1088 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1089 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1090
1091 /* It can be assumed that the data refs with the same alignment as dr_peel
1092 are aligned in the vector loop. */
1093 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1094 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1095 {
1096 if (current_dr != dr_info->dr)
1097 continue;
1098 gcc_assert (!known_alignment_for_access_p (dr_info)
1099 || !known_alignment_for_access_p (dr_peel_info)
1100 || (DR_MISALIGNMENT (dr_info) / dr_size
1101 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1102 SET_DR_MISALIGNMENT (dr_info, 0);
1103 return;
1104 }
1105
1106 if (known_alignment_for_access_p (dr_info)
1107 && known_alignment_for_access_p (dr_peel_info))
1108 {
1109 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1110 size_zero_node) < 0;
1111 int misal = DR_MISALIGNMENT (dr_info);
1112 misal += negative ? -npeel * dr_size : npeel * dr_size;
1113 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1114 SET_DR_MISALIGNMENT (dr_info, misal);
1115 return;
1116 }
1117
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1120 "to unknown (-1).\n");
1121 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1122 }
1123
1124
1125 /* Function verify_data_ref_alignment
1126
1127 Return TRUE if DR_INFO can be handled with respect to alignment. */
1128
1129 static bool
1130 verify_data_ref_alignment (dr_vec_info *dr_info)
1131 {
1132 enum dr_alignment_support supportable_dr_alignment
1133 = vect_supportable_dr_alignment (dr_info, false);
1134 if (!supportable_dr_alignment)
1135 {
1136 if (dump_enabled_p ())
1137 {
1138 if (DR_IS_READ (dr_info->dr))
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "not vectorized: unsupported unaligned load.");
1141 else
1142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143 "not vectorized: unsupported unaligned "
1144 "store.");
1145
1146 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1147 DR_REF (dr_info->dr));
1148 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1149 }
1150 return false;
1151 }
1152
1153 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1154 dump_printf_loc (MSG_NOTE, vect_location,
1155 "Vectorizing an unaligned access.\n");
1156
1157 return true;
1158 }
1159
1160 /* Function vect_verify_datarefs_alignment
1161
1162 Return TRUE if all data references in the loop can be
1163 handled with respect to alignment. */
1164
1165 bool
1166 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1167 {
1168 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1169 struct data_reference *dr;
1170 unsigned int i;
1171
1172 FOR_EACH_VEC_ELT (datarefs, i, dr)
1173 {
1174 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1175 stmt_vec_info stmt_info = dr_info->stmt;
1176
1177 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1178 continue;
1179
1180 /* For interleaving, only the alignment of the first access matters. */
1181 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1182 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1183 continue;
1184
1185 /* Strided accesses perform only component accesses, alignment is
1186 irrelevant for them. */
1187 if (STMT_VINFO_STRIDED_P (stmt_info)
1188 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1189 continue;
1190
1191 if (! verify_data_ref_alignment (dr_info))
1192 return false;
1193 }
1194
1195 return true;
1196 }
1197
1198 /* Given a memory reference EXP, return whether its alignment is less
1199 than its size. */
1200
1201 static bool
1202 not_size_aligned (tree exp)
1203 {
1204 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205 return true;
1206
1207 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208 > get_object_alignment (exp));
1209 }
1210
1211 /* Function vector_alignment_reachable_p
1212
1213 Return true if vector alignment for DR_INFO is reachable by peeling
1214 a few loop iterations. Return false otherwise. */
1215
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1218 {
1219 stmt_vec_info stmt_info = dr_info->stmt;
1220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1221
1222 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1223 {
1224 /* For interleaved access we peel only if the number of iterations in
1225 the prolog loop ({VF - misalignment}) is a multiple of the
1226 interleaving group size. */
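/* E.g. with four-element vectors, a misalignment of one element and a
   group size of 2, the prolog would need 4 - 1 == 3 scalar iterations,
   which is not a multiple of 2, so peeling cannot align this group.  */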
1227 int elem_size, mis_in_elements;
1228
1229 /* FORNOW: handle only known alignment. */
1230 if (!known_alignment_for_access_p (dr_info))
1231 return false;
1232
1233 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235 elem_size = vector_element_size (vector_size, nelements);
1236 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1237
1238 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239 return false;
1240 }
1241
1242 /* If misalignment is known at the compile time then allow peeling
1243 only if natural alignment is reachable through peeling. */
1244 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1245 {
1246 HOST_WIDE_INT elmsize =
1247 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248 if (dump_enabled_p ())
1249 {
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "data size = %wd. misalignment = %d.\n", elmsize,
1252 DR_MISALIGNMENT (dr_info));
1253 }
1254 if (DR_MISALIGNMENT (dr_info) % elmsize)
1255 {
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "data size does not divide the misalignment.\n");
1259 return false;
1260 }
1261 }
1262
1263 if (!known_alignment_for_access_p (dr_info))
1264 {
1265 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Unknown misalignment, %snaturally aligned\n",
1270 is_packed ? "not " : "");
1271 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272 }
1273
1274 return true;
1275 }
1276
1277
1278 /* Calculate the cost of the memory access represented by DR_INFO. */
1279
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282 unsigned int *inside_cost,
1283 unsigned int *outside_cost,
1284 stmt_vector_for_cost *body_cost_vec,
1285 stmt_vector_for_cost *prologue_cost_vec)
1286 {
1287 stmt_vec_info stmt_info = dr_info->stmt;
1288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289 int ncopies;
1290
1291 if (PURE_SLP_STMT (stmt_info))
1292 ncopies = 1;
1293 else
1294 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295
1296 if (DR_IS_READ (dr_info->dr))
1297 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298 prologue_cost_vec, body_cost_vec, false);
1299 else
1300 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_NOTE, vect_location,
1304 "vect_get_data_access_cost: inside_cost = %d, "
1305 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1306 }
1307
1308
1309 typedef struct _vect_peel_info
1310 {
1311 dr_vec_info *dr_info;
1312 int npeel;
1313 unsigned int count;
1314 } *vect_peel_info;
1315
1316 typedef struct _vect_peel_extended_info
1317 {
1318 struct _vect_peel_info peel_info;
1319 unsigned int inside_cost;
1320 unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1322
1323
1324 /* Peeling hashtable helpers. */
1325
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 {
1328 static inline hashval_t hash (const _vect_peel_info *);
1329 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1330 };
1331
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 {
1335 return (hashval_t) peel_info->npeel;
1336 }
1337
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 {
1341 return (a->npeel == b->npeel);
1342 }
1343
1344
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1346
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350 int npeel)
1351 {
1352 struct _vect_peel_info elem, *slot;
1353 _vect_peel_info **new_slot;
1354 bool supportable_dr_alignment
1355 = vect_supportable_dr_alignment (dr_info, true);
1356
1357 elem.npeel = npeel;
1358 slot = peeling_htab->find (&elem);
1359 if (slot)
1360 slot->count++;
1361 else
1362 {
1363 slot = XNEW (struct _vect_peel_info);
1364 slot->npeel = npeel;
1365 slot->dr_info = dr_info;
1366 slot->count = 1;
1367 new_slot = peeling_htab->find_slot (slot, INSERT);
1368 *new_slot = slot;
1369 }
1370
1371 if (!supportable_dr_alignment
1372 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373 slot->count += VECT_MAX_COST;
1374 }
1375
1376
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378 number of data accesses. */
1379
1380 int
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382 _vect_peel_extended_info *max)
1383 {
1384 vect_peel_info elem = *slot;
1385
1386 if (elem->count > max->peel_info.count
1387 || (elem->count == max->peel_info.count
1388 && max->peel_info.npeel > elem->npeel))
1389 {
1390 max->peel_info.npeel = elem->npeel;
1391 max->peel_info.count = elem->count;
1392 max->peel_info.dr_info = elem->dr_info;
1393 }
1394
1395 return 1;
1396 }
1397
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1400 we assume DR0_INFO's misalignment will be zero after peeling. */
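/* The costs are gathered by temporarily giving each data reference the
   misalignment it would have after the peel (zero for DR0_INFO itself
   when UNKNOWN_MISALIGNMENT), querying the access cost, and then
   restoring the real misalignment, so nothing is changed permanently.  */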
1401
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404 dr_vec_info *dr0_info,
1405 unsigned int *inside_cost,
1406 unsigned int *outside_cost,
1407 stmt_vector_for_cost *body_cost_vec,
1408 stmt_vector_for_cost *prologue_cost_vec,
1409 unsigned int npeel,
1410 bool unknown_misalignment)
1411 {
1412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413 unsigned i;
1414 data_reference *dr;
1415
1416 FOR_EACH_VEC_ELT (datarefs, i, dr)
1417 {
1418 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419 stmt_vec_info stmt_info = dr_info->stmt;
1420 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421 continue;
1422
1423 /* For interleaving, only the alignment of the first access
1424 matters. */
1425 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427 continue;
1428
1429 /* Strided accesses perform only component accesses, alignment is
1430 irrelevant for them. */
1431 if (STMT_VINFO_STRIDED_P (stmt_info)
1432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433 continue;
1434
1435 int save_misalignment;
1436 save_misalignment = DR_MISALIGNMENT (dr_info);
1437 if (npeel == 0)
1438 ;
1439 else if (unknown_misalignment && dr_info == dr0_info)
1440 SET_DR_MISALIGNMENT (dr_info, 0);
1441 else
1442 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444 body_cost_vec, prologue_cost_vec);
1445 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1446 }
1447 }
1448
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450 Find the one with the lowest cost. */
1451
1452 int
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454 _vect_peel_extended_info *min)
1455 {
1456 vect_peel_info elem = *slot;
1457 int dummy;
1458 unsigned int inside_cost = 0, outside_cost = 0;
1459 stmt_vec_info stmt_info = elem->dr_info->stmt;
1460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462 epilogue_cost_vec;
1463
1464 prologue_cost_vec.create (2);
1465 body_cost_vec.create (2);
1466 epilogue_cost_vec.create (2);
1467
1468 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469 &outside_cost, &body_cost_vec,
1470 &prologue_cost_vec, elem->npeel, false);
1471
1472 body_cost_vec.release ();
1473
1474 outside_cost += vect_get_known_peeling_cost
1475 (loop_vinfo, elem->npeel, &dummy,
1476 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477 &prologue_cost_vec, &epilogue_cost_vec);
1478
1479 /* Prologue and epilogue costs are added to the target model later.
1480 These costs depend only on the scalar iteration cost, the
1481 number of peeling iterations finally chosen, and the number of
1482 misaligned statements. So discard the information found here. */
1483 prologue_cost_vec.release ();
1484 epilogue_cost_vec.release ();
1485
1486 if (inside_cost < min->inside_cost
1487 || (inside_cost == min->inside_cost
1488 && outside_cost < min->outside_cost))
1489 {
1490 min->inside_cost = inside_cost;
1491 min->outside_cost = outside_cost;
1492 min->peel_info.dr_info = elem->dr_info;
1493 min->peel_info.npeel = elem->npeel;
1494 min->peel_info.count = elem->count;
1495 }
1496
1497 return 1;
1498 }
1499
1500
1501 /* Choose best peeling option by traversing peeling hash table and either
1502 choosing an option with the lowest cost (if cost model is enabled) or the
1503 option that aligns as many accesses as possible. */
1504
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507 loop_vec_info loop_vinfo)
1508 {
1509 struct _vect_peel_extended_info res;
1510
1511 res.peel_info.dr_info = NULL;
1512
1513 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 {
1515 res.inside_cost = INT_MAX;
1516 res.outside_cost = INT_MAX;
1517 peeling_htab->traverse <_vect_peel_extended_info *,
1518 vect_peeling_hash_get_lowest_cost> (&res);
1519 }
1520 else
1521 {
1522 res.peel_info.count = 0;
1523 peeling_htab->traverse <_vect_peel_extended_info *,
1524 vect_peeling_hash_get_most_frequent> (&res);
1525 res.inside_cost = 0;
1526 res.outside_cost = 0;
1527 }
1528
1529 return res;
1530 }
1531
1532 /* Return true if peeling NPEEL iterations for DR0_INFO leaves all data refs with a supportable alignment. */
1533
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536 unsigned npeel)
1537 {
1538 unsigned i;
1539 struct data_reference *dr = NULL;
1540 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541 enum dr_alignment_support supportable_dr_alignment;
1542
1543 /* Ensure that all data refs can be vectorized after the peel. */
1544 FOR_EACH_VEC_ELT (datarefs, i, dr)
1545 {
1546 int save_misalignment;
1547
1548 if (dr == dr0_info->dr)
1549 continue;
1550
1551 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552 stmt_vec_info stmt_info = dr_info->stmt;
1553 /* For interleaving, only the alignment of the first access
1554 matters. */
1555 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557 continue;
1558
1559 /* Strided accesses perform only component accesses, alignment is
1560 irrelevant for them. */
1561 if (STMT_VINFO_STRIDED_P (stmt_info)
1562 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 continue;
1564
1565 save_misalignment = DR_MISALIGNMENT (dr_info);
1566 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567 supportable_dr_alignment
1568 = vect_supportable_dr_alignment (dr_info, false);
1569 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1570
1571 if (!supportable_dr_alignment)
1572 return false;
1573 }
1574
1575 return true;
1576 }
1577
1578 /* Function vect_enhance_data_refs_alignment
1579
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1582
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass - are not supposed to be
1586 vectorized. This restriction will be relaxed.
1587
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 intel uses when given a loop with several memory accesses, is as follows:
1591 choose one memory access ('p') which alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1596
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1600
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1607
1608 Here are the general steps involved in alignment enhancements:
1609
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1614 }
1615
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 }
1621
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1627 }
1628 }
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1633 }
1634 }
1635
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1640 }
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1644 }
1645
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1650 }
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1655 }
1656 }
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1661 }
1662 }
1663
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1668
1669 bool
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671 {
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 dr_vec_info *first_store = NULL;
1676 dr_vec_info *dr0_info = NULL;
1677 struct data_reference *dr;
1678 unsigned int i, j;
1679 bool do_peeling = false;
1680 bool do_versioning = false;
1681 bool stat;
1682 unsigned int npeel = 0;
1683 bool one_misalignment_known = false;
1684 bool one_misalignment_unknown = false;
1685 bool one_dr_unsupportable = false;
1686 dr_vec_info *unsupportable_dr_info = NULL;
1687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1688 unsigned possible_npeel_number = 1;
1689 tree vectype;
1690 unsigned int mis, same_align_drs_max = 0;
1691 hash_table<peel_info_hasher> peeling_htab (1);
1692
1693 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1694
1695 /* Reset data so we can safely be called multiple times. */
1696 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1697 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1698
1699 /* While cost model enhancements are expected in the future, the high level
1700 view of the code at this time is as follows:
1701
1702 A) If there is a misaligned access then see if peeling to align
1703 this access can make all data references satisfy
1704 vect_supportable_dr_alignment. If so, update data structures
1705 as needed and return true.
1706
1707 B) If peeling wasn't possible and there is a data reference with an
1708 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1709 then see if loop versioning checks can be used to make all data
1710 references satisfy vect_supportable_dr_alignment. If so, update
1711 data structures as needed and return true.
1712
1713 C) If neither peeling nor versioning were successful then return false if
1714 any data reference does not satisfy vect_supportable_dr_alignment.
1715
1716 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1717
1718 Note, Possibility 3 above (which is peeling and versioning together) is not
1719 being done at this time. */
1720
1721 /* (1) Peeling to force alignment. */
1722
1723 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1724 Considerations:
1725 + How many accesses will become aligned due to the peeling
1726 - How many accesses will become unaligned due to the peeling,
1727 and the cost of misaligned accesses.
1728 - The cost of peeling (the extra runtime checks, the increase
1729 in code size). */
1730
1731 FOR_EACH_VEC_ELT (datarefs, i, dr)
1732 {
1733 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1734 stmt_vec_info stmt_info = dr_info->stmt;
1735
1736 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1737 continue;
1738
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1743 continue;
1744
1745 /* For scatter-gather or invariant accesses there is nothing
1746 to enhance. */
1747 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1748 || integer_zerop (DR_STEP (dr)))
1749 continue;
1750
1751 /* Strided accesses perform only component accesses; alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755 continue;
1756
1757 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1758 do_peeling = vector_alignment_reachable_p (dr_info);
1759 if (do_peeling)
1760 {
1761 if (known_alignment_for_access_p (dr_info))
1762 {
1763 unsigned int npeel_tmp = 0;
1764 bool negative = tree_int_cst_compare (DR_STEP (dr),
1765 size_zero_node) < 0;
1766
1767 vectype = STMT_VINFO_VECTYPE (stmt_info);
1768 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1769 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1770 mis = (negative
1771 ? DR_MISALIGNMENT (dr_info)
1772 : -DR_MISALIGNMENT (dr_info));
1773 if (DR_MISALIGNMENT (dr_info) != 0)
1774 npeel_tmp = (mis & (target_align - 1)) / dr_size;
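/* Illustrative arithmetic (numbers chosen for exposition only): for a
   positive-step access with DR_MISALIGNMENT = 4, TARGET_ALIGN = 16 and
   DR_SIZE = 4, MIS is (unsigned int) -4, so
   NPEEL_TMP = ((unsigned) -4 & 15) / 4 = 12 / 4 = 3; peeling three
   scalar iterations advances the access by 12 bytes and makes it
   16-byte aligned.  */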
1775
1776 /* For multiple types, it is possible that the bigger type access
1777 will have more than one peeling option. E.g., a loop with two
1778 types: one of size (vector size / 4), and the other one of
1779 size (vector size / 8). The vectorization factor will be 8. If both
1780 accesses are misaligned by 3, the first one needs one scalar
1781 iteration to be aligned, and the second one needs 5. But the
1782 first one will be aligned also by peeling 5 scalar
1783 iterations, and in that case both accesses will be aligned.
1784 Hence, in addition to the immediate peeling amount, we also want
1785 to try adding a full vector's worth of elements, as long as we
1786 don't exceed the vectorization factor.
1787 We do this automatically for the cost model, since we calculate
1788 the cost of every peeling option. */
1789 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1790 {
1791 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1792 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1793 possible_npeel_number
1794 = vect_get_num_vectors (nscalars, vectype);
1795
1796 /* NPEEL_TMP is 0 when there is no misalignment, but in that
1797 case also allow peeling NELEMENTS. */
1798 if (DR_MISALIGNMENT (dr_info) == 0)
1799 possible_npeel_number++;
1800 }
1801
1802 /* Save info about DR in the hash table. Also include peeling
1803 amounts according to the explanation above. */
1804 for (j = 0; j < possible_npeel_number; j++)
1805 {
1806 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1807 dr_info, npeel_tmp);
1808 npeel_tmp += target_align / dr_size;
1809 }
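/* Continuing the illustrative example above: with NPEEL_TMP = 3,
   TARGET_ALIGN / DR_SIZE = 4 and POSSIBLE_NPEEL_NUMBER = 2, the
   candidate peeling amounts recorded in the hash table would be
   3 and 7; both make this access aligned, and the best candidate
   is chosen from the hash table later.  */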
1810
1811 one_misalignment_known = true;
1812 }
1813 else
1814 {
1815 /* If we don't know any misalignment values, we prefer
1816 peeling for the data-ref that has the maximum number of data-refs
1817 with the same alignment, unless the target prefers to align
1818 stores over loads. */
1819 unsigned same_align_drs
1820 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1821 if (!dr0_info
1822 || same_align_drs_max < same_align_drs)
1823 {
1824 same_align_drs_max = same_align_drs;
1825 dr0_info = dr_info;
1826 }
1827 /* For data-refs with the same number of related
1828 accesses prefer the one where the misalign
1829 computation will be invariant in the outermost loop. */
1830 else if (same_align_drs_max == same_align_drs)
1831 {
1832 struct loop *ivloop0, *ivloop;
1833 ivloop0 = outermost_invariant_loop_for_expr
1834 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1835 ivloop = outermost_invariant_loop_for_expr
1836 (loop, DR_BASE_ADDRESS (dr));
1837 if ((ivloop && !ivloop0)
1838 || (ivloop && ivloop0
1839 && flow_loop_nested_p (ivloop, ivloop0)))
1840 dr0_info = dr_info;
1841 }
1842
1843 one_misalignment_unknown = true;
1844
1845 /* Check for data refs with unsupportable alignment that
1846 can be peeled. */
1847 if (!supportable_dr_alignment)
1848 {
1849 one_dr_unsupportable = true;
1850 unsupportable_dr_info = dr_info;
1851 }
1852
1853 if (!first_store && DR_IS_WRITE (dr))
1854 first_store = dr_info;
1855 }
1856 }
1857 else
1858 {
1859 if (!aligned_access_p (dr_info))
1860 {
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "vector alignment may not be reachable\n");
1864 break;
1865 }
1866 }
1867 }
1868
1869 /* Check if we can possibly peel the loop. */
1870 if (!vect_can_advance_ivs_p (loop_vinfo)
1871 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1872 || loop->inner)
1873 do_peeling = false;
1874
1875 struct _vect_peel_extended_info peel_for_known_alignment;
1876 struct _vect_peel_extended_info peel_for_unknown_alignment;
1877 struct _vect_peel_extended_info best_peel;
1878
1879 peel_for_unknown_alignment.inside_cost = INT_MAX;
1880 peel_for_unknown_alignment.outside_cost = INT_MAX;
1881 peel_for_unknown_alignment.peel_info.count = 0;
1882
1883 if (do_peeling
1884 && one_misalignment_unknown)
1885 {
1886 /* Check whether the target prefers stores over loads, i.e., whether
1887 misaligned stores are more expensive than misaligned loads (taking
1888 drs with the same alignment into account). */
1889 unsigned int load_inside_cost = 0;
1890 unsigned int load_outside_cost = 0;
1891 unsigned int store_inside_cost = 0;
1892 unsigned int store_outside_cost = 0;
1893 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1894
1895 stmt_vector_for_cost dummy;
1896 dummy.create (2);
1897 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1898 &load_inside_cost,
1899 &load_outside_cost,
1900 &dummy, &dummy, estimated_npeels, true);
1901 dummy.release ();
1902
1903 if (first_store)
1904 {
1905 dummy.create (2);
1906 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1907 &store_inside_cost,
1908 &store_outside_cost,
1909 &dummy, &dummy,
1910 estimated_npeels, true);
1911 dummy.release ();
1912 }
1913 else
1914 {
1915 store_inside_cost = INT_MAX;
1916 store_outside_cost = INT_MAX;
1917 }
1918
1919 if (load_inside_cost > store_inside_cost
1920 || (load_inside_cost == store_inside_cost
1921 && load_outside_cost > store_outside_cost))
1922 {
1923 dr0_info = first_store;
1924 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1925 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1926 }
1927 else
1928 {
1929 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1930 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1931 }
1932
1933 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1934 prologue_cost_vec.create (2);
1935 epilogue_cost_vec.create (2);
1936
1937 int dummy2;
1938 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1939 (loop_vinfo, estimated_npeels, &dummy2,
1940 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1941 &prologue_cost_vec, &epilogue_cost_vec);
1942
1943 prologue_cost_vec.release ();
1944 epilogue_cost_vec.release ();
1945
1946 peel_for_unknown_alignment.peel_info.count = 1
1947 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1948 }
1949
1950 peel_for_unknown_alignment.peel_info.npeel = 0;
1951 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1952
1953 best_peel = peel_for_unknown_alignment;
1954
1955 peel_for_known_alignment.inside_cost = INT_MAX;
1956 peel_for_known_alignment.outside_cost = INT_MAX;
1957 peel_for_known_alignment.peel_info.count = 0;
1958 peel_for_known_alignment.peel_info.dr_info = NULL;
1959
1960 if (do_peeling && one_misalignment_known)
1961 {
1962 /* Peeling is possible, but no data access requires alignment in order
1963 to be supported. So we try to choose the best possible peeling from
1964 the hash table. */
1965 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1966 (&peeling_htab, loop_vinfo);
1967 }
1968
1969 /* Compare costs of peeling for known and unknown alignment. */
1970 if (peel_for_known_alignment.peel_info.dr_info != NULL
1971 && peel_for_unknown_alignment.inside_cost
1972 >= peel_for_known_alignment.inside_cost)
1973 {
1974 best_peel = peel_for_known_alignment;
1975
1976 /* If the best peeling for known alignment has NPEEL == 0, perform no
1977 peeling at all except if there is an unsupportable dr that we can
1978 align. */
1979 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1980 do_peeling = false;
1981 }
1982
1983 /* If there is an unsupportable data ref, prefer this over all choices so far
1984 since we'd have to discard a chosen peeling except when it accidentally
1985 aligned the unsupportable data ref. */
1986 if (one_dr_unsupportable)
1987 dr0_info = unsupportable_dr_info;
1988 else if (do_peeling)
1989 {
1990 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1991 TODO: Use nopeel_outside_cost or get rid of it? */
1992 unsigned nopeel_inside_cost = 0;
1993 unsigned nopeel_outside_cost = 0;
1994
1995 stmt_vector_for_cost dummy;
1996 dummy.create (2);
1997 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1998 &nopeel_outside_cost, &dummy, &dummy,
1999 0, false);
2000 dummy.release ();
2001
2002 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2003 costs will be recorded. */
2004 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2005 prologue_cost_vec.create (2);
2006 epilogue_cost_vec.create (2);
2007
2008 int dummy2;
2009 nopeel_outside_cost += vect_get_known_peeling_cost
2010 (loop_vinfo, 0, &dummy2,
2011 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2012 &prologue_cost_vec, &epilogue_cost_vec);
2013
2014 prologue_cost_vec.release ();
2015 epilogue_cost_vec.release ();
2016
2017 npeel = best_peel.peel_info.npeel;
2018 dr0_info = best_peel.peel_info.dr_info;
2019
2020 /* If not peeling at all is no more expensive than the best peeling we
2021 have so far, don't perform any peeling. */
2022 if (nopeel_inside_cost <= best_peel.inside_cost)
2023 do_peeling = false;
2024 }
2025
2026 if (do_peeling)
2027 {
2028 stmt_vec_info stmt_info = dr0_info->stmt;
2029 vectype = STMT_VINFO_VECTYPE (stmt_info);
2030
2031 if (known_alignment_for_access_p (dr0_info))
2032 {
2033 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2034 size_zero_node) < 0;
2035 if (!npeel)
2036 {
2037 /* Since it's known at compile time, compute the number of
2038 iterations in the peeled loop (the peeling factor) for use in
2039 updating DR_MISALIGNMENT values. The peeling factor is the
2040 vectorization factor minus the misalignment as an element
2041 count. */
2042 mis = (negative
2043 ? DR_MISALIGNMENT (dr0_info)
2044 : -DR_MISALIGNMENT (dr0_info));
2045 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2046 npeel = ((mis & (target_align - 1))
2047 / vect_get_scalar_dr_size (dr0_info));
2048 }
2049
2050 /* For interleaved data accesses every iteration accesses all the
2051 members of the group, so we divide the number of iterations
2052 by the group size. */
2053 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2054 npeel /= DR_GROUP_SIZE (stmt_info);
2055
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "Try peeling by %d\n", npeel);
2059 }
2060
2061 /* Ensure that all datarefs can be vectorized after the peel. */
2062 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2063 do_peeling = false;
2064
2065 /* Check if all datarefs are supportable and log. */
2066 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2067 {
2068 stat = vect_verify_datarefs_alignment (loop_vinfo);
2069 if (!stat)
2070 do_peeling = false;
2071 else
2072 return stat;
2073 }
2074
2075 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2076 if (do_peeling)
2077 {
2078 unsigned max_allowed_peel
2079 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2080 if (max_allowed_peel != (unsigned)-1)
2081 {
2082 unsigned max_peel = npeel;
2083 if (max_peel == 0)
2084 {
2085 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2086 max_peel = (target_align
2087 / vect_get_scalar_dr_size (dr0_info) - 1);
2088 }
2089 if (max_peel > max_allowed_peel)
2090 {
2091 do_peeling = false;
2092 if (dump_enabled_p ())
2093 dump_printf_loc (MSG_NOTE, vect_location,
2094 "Disable peeling, max peels reached: %d\n", max_peel);
2095 }
2096 }
2097 }
2098
2099 /* Cost model #2 - if peeling may result in a remaining loop not
2100 iterating enough to be vectorized then do not peel. Since this
2101 is a cost heuristic rather than a correctness decision, use the
2102 most likely runtime value for variable vectorization factors. */
2103 if (do_peeling
2104 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2105 {
2106 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2107 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2108 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2109 < assumed_vf + max_peel)
2110 do_peeling = false;
2111 }
2112
2113 if (do_peeling)
2114 {
2115 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2116 If the misalignment of DR_i is identical to that of dr0 then set
2117 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2118 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2119 by the peeling factor times the element size of DR_i (MOD the
2120 vectorization factor times the size). Otherwise, the
2121 misalignment of DR_i must be set to unknown. */
2122 FOR_EACH_VEC_ELT (datarefs, i, dr)
2123 if (dr != dr0_info->dr)
2124 {
2125 /* Strided accesses perform only component accesses; alignment
2126 is irrelevant for them. */
2127 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2128 stmt_info = dr_info->stmt;
2129 if (STMT_VINFO_STRIDED_P (stmt_info)
2130 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2131 continue;
2132
2133 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2134 }
2135
2136 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2137 if (npeel)
2138 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2139 else
2140 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2141 = DR_MISALIGNMENT (dr0_info);
2142 SET_DR_MISALIGNMENT (dr0_info, 0);
2143 if (dump_enabled_p ())
2144 {
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Alignment of access forced using peeling.\n");
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "Peeling for alignment will be applied.\n");
2149 }
2150
2151 /* The inside-loop cost will be accounted for in vectorizable_load
2152 and vectorizable_store correctly with adjusted alignments.
2153 Drop the body_cst_vec on the floor here. */
2154 stat = vect_verify_datarefs_alignment (loop_vinfo);
2155 gcc_assert (stat);
2156 return stat;
2157 }
2158 }
2159
2160 /* (2) Versioning to force alignment. */
2161
2162 /* Try versioning if:
2163 1) the loop is optimized for speed, and
2164 2) there is at least one unsupported misaligned data ref with an unknown
2165 misalignment, and
2166 3) all misaligned data refs with a known misalignment are supported, and
2167 4) the number of runtime alignment checks is within reason. */
2168
2169 do_versioning =
2170 optimize_loop_nest_for_speed_p (loop)
2171 && (!loop->inner); /* FORNOW */
2172
2173 if (do_versioning)
2174 {
2175 FOR_EACH_VEC_ELT (datarefs, i, dr)
2176 {
2177 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2178 stmt_vec_info stmt_info = dr_info->stmt;
2179
2180 /* For interleaving, only the alignment of the first access
2181 matters. */
2182 if (aligned_access_p (dr_info)
2183 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2184 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2185 continue;
2186
2187 if (STMT_VINFO_STRIDED_P (stmt_info))
2188 {
2189 /* Strided loads perform only component accesses; alignment is
2190 irrelevant for them. */
2191 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2192 continue;
2193 do_versioning = false;
2194 break;
2195 }
2196
2197 supportable_dr_alignment
2198 = vect_supportable_dr_alignment (dr_info, false);
2199
2200 if (!supportable_dr_alignment)
2201 {
2202 int mask;
2203 tree vectype;
2204
2205 if (known_alignment_for_access_p (dr_info)
2206 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2207 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2208 {
2209 do_versioning = false;
2210 break;
2211 }
2212
2213 vectype = STMT_VINFO_VECTYPE (stmt_info);
2214 gcc_assert (vectype);
2215
2216 /* At present we don't support versioning for alignment
2217 with variable VF, since there's no guarantee that the
2218 VF is a power of two. We could relax this if we added
2219 a way of enforcing a power-of-two size. */
2220 unsigned HOST_WIDE_INT size;
2221 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2222 {
2223 do_versioning = false;
2224 break;
2225 }
2226
2227 /* The rightmost bits of an aligned address must be zeros.
2228 Construct the mask needed for this test. For example,
2229 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2230 mask must be 15 = 0xf. */
2231 mask = size - 1;
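/* For instance (illustrative only), with a 16-byte vector the
   versioning test that eventually guards the vectorized loop is
   conceptually equivalent to checking ((uintptr_t) p & 15) == 0 for
   each address recorded in LOOP_VINFO_MAY_MISALIGN_STMTS; the
   generated code folds these tests together using MASK.  */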
2232
2233 /* FORNOW: use the same mask to test all potentially unaligned
2234 references in the loop. The vectorizer currently supports
2235 a single vector size, see the reference to
2236 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2237 vectorization factor is computed. */
2238 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2239 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2240 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2241 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2242 }
2243 }
2244
2245 /* Versioning requires at least one misaligned data reference. */
2246 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2247 do_versioning = false;
2248 else if (!do_versioning)
2249 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2250 }
2251
2252 if (do_versioning)
2253 {
2254 vec<stmt_vec_info> may_misalign_stmts
2255 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2256 stmt_vec_info stmt_info;
2257
2258 /* It can now be assumed that the data references in the statements
2259 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2260 of the loop being vectorized. */
2261 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2262 {
2263 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2264 SET_DR_MISALIGNMENT (dr_info, 0);
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Alignment of access forced using versioning.\n");
2268 }
2269
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "Versioning for alignment will be applied.\n");
2273
2274 /* Peeling and versioning can't be done together at this time. */
2275 gcc_assert (! (do_peeling && do_versioning));
2276
2277 stat = vect_verify_datarefs_alignment (loop_vinfo);
2278 gcc_assert (stat);
2279 return stat;
2280 }
2281
2282 /* This point is reached if neither peeling nor versioning is being done. */
2283 gcc_assert (! (do_peeling || do_versioning));
2284
2285 stat = vect_verify_datarefs_alignment (loop_vinfo);
2286 return stat;
2287 }
2288
2289
2290 /* Function vect_find_same_alignment_drs.
2291
2292 Update group and alignment relations in VINFO according to the chosen
2293 vectorization factor. */
2294
2295 static void
2296 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2297 {
2298 struct data_reference *dra = DDR_A (ddr);
2299 struct data_reference *drb = DDR_B (ddr);
2300 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2301 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2302 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2303 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2304
2305 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2306 return;
2307
2308 if (dra == drb)
2309 return;
2310
2311 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2312 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2313 return;
2314
2315 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2316 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2317 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2318 return;
2319
2320 /* Two references with distance zero have the same alignment. */
2321 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2322 - wi::to_poly_offset (DR_INIT (drb)));
2323 if (maybe_ne (diff, 0))
2324 {
2325 /* Get the wider of the two alignments. */
2326 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2327 / BITS_PER_UNIT);
2328 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2329 / BITS_PER_UNIT);
2330 unsigned int max_align = MAX (align_a, align_b);
2331
2332 /* Require the gap to be a multiple of the larger vector alignment. */
2333 if (!multiple_p (diff, max_align))
2334 return;
2335 }
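/* Illustrative example (assuming a 16-byte target alignment for both
   references): accesses a[i] and a[i+4] to a 4-byte type differ by
   16 bytes, a multiple of that alignment, so they are recorded below
   as having the same alignment; a[i] and a[i+1] differ by only 4
   bytes and are not recorded.  */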
2336
2337 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2338 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2339 if (dump_enabled_p ())
2340 {
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "accesses have the same alignment: ");
2343 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2344 dump_printf (MSG_NOTE, " and ");
2345 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2346 dump_printf (MSG_NOTE, "\n");
2347 }
2348 }
2349
2350
2351 /* Function vect_analyze_data_refs_alignment
2352
2353 Analyze the alignment of the data-references in the loop.
2354 Return FALSE if a data reference is found that cannot be vectorized. */
2355
2356 bool
2357 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2358 {
2359 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2360
2361 /* Mark groups of data references with same alignment using
2362 data dependence information. */
2363 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2364 struct data_dependence_relation *ddr;
2365 unsigned int i;
2366
2367 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2368 vect_find_same_alignment_drs (vinfo, ddr);
2369
2370 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2371 struct data_reference *dr;
2372
2373 vect_record_base_alignments (vinfo);
2374 FOR_EACH_VEC_ELT (datarefs, i, dr)
2375 {
2376 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2377 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2378 vect_compute_data_ref_alignment (dr_info);
2379 }
2380
2381 return true;
2382 }
2383
2384
2385 /* Analyze alignment of DRs of stmts in NODE. */
2386
2387 static bool
2388 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2389 {
2390 /* We vectorize from the first scalar stmt in the node unless
2391 the node is permuted, in which case we start from the first
2392 element in the group. */
2393 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2394 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2395 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2396 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2397
2398 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2399 vect_compute_data_ref_alignment (dr_info);
2400 /* For creating the data-ref pointer we need alignment of the
2401 first element anyway. */
2402 if (dr_info != first_dr_info)
2403 vect_compute_data_ref_alignment (first_dr_info);
2404 if (! verify_data_ref_alignment (dr_info))
2405 {
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "not vectorized: bad data alignment in basic "
2409 "block.\n");
2410 return false;
2411 }
2412
2413 return true;
2414 }
2415
2416 /* Function vect_slp_analyze_and_verify_instance_alignment
2417
2418 Analyze the alignment of the data-references in the SLP instance.
2419 Return FALSE if a data reference is found that cannot be vectorized. */
2420
2421 bool
2422 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2423 {
2424 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2425
2426 slp_tree node;
2427 unsigned i;
2428 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2429 if (! vect_slp_analyze_and_verify_node_alignment (node))
2430 return false;
2431
2432 node = SLP_INSTANCE_TREE (instance);
2433 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2434 && ! vect_slp_analyze_and_verify_node_alignment
2435 (SLP_INSTANCE_TREE (instance)))
2436 return false;
2437
2438 return true;
2439 }
2440
2441
2442 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2443 accesses of legal size, step, etc. Detect gaps, single element
2444 interleaving, and other special cases. Set grouped access info.
2445 Collect groups of strided stores for further use in SLP analysis.
2446 Worker for vect_analyze_group_access. */
2447
2448 static bool
2449 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2450 {
2451 data_reference *dr = dr_info->dr;
2452 tree step = DR_STEP (dr);
2453 tree scalar_type = TREE_TYPE (DR_REF (dr));
2454 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2455 stmt_vec_info stmt_info = dr_info->stmt;
2456 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2457 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2458 HOST_WIDE_INT dr_step = -1;
2459 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2460 bool slp_impossible = false;
2461
2462 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2463 size of the interleaving group (including gaps). */
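/* For example (illustrative), a load of a[3*i] in a loop over i, where
   a is an array of 4-byte ints, has DR_STEP 12 and hence GROUPSIZE 3:
   the interleaving group covers elements 3*i, 3*i+1 and 3*i+2 even if
   only some of them are actually accessed (the rest are gaps).  */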
2464 if (tree_fits_shwi_p (step))
2465 {
2466 dr_step = tree_to_shwi (step);
2467 /* Check that STEP is a multiple of type size. Otherwise there is
2468 a non-element-sized gap at the end of the group which we
2469 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2470 ??? As we can handle non-constant step fine here we should
2471 simply remove uses of DR_GROUP_GAP between the last and first
2472 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2473 simply not include that gap. */
2474 if ((dr_step % type_size) != 0)
2475 {
2476 if (dump_enabled_p ())
2477 {
2478 dump_printf_loc (MSG_NOTE, vect_location,
2479 "Step ");
2480 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2481 dump_printf (MSG_NOTE,
2482 " is not a multiple of the element size for ");
2483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2484 dump_printf (MSG_NOTE, "\n");
2485 }
2486 return false;
2487 }
2488 groupsize = absu_hwi (dr_step) / type_size;
2489 }
2490 else
2491 groupsize = 0;
2492
2493 /* A non-consecutive access is possible only if it is part of interleaving. */
2494 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2495 {
2496 /* Check whether this DR is part of an interleaving group, and is a single
2497 element of the group that is accessed in the loop. */
2498
2499 /* Gaps are supported only for loads. STEP must be a multiple of the type
2500 size. */
2501 if (DR_IS_READ (dr)
2502 && (dr_step % type_size) == 0
2503 && groupsize > 0)
2504 {
2505 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2506 DR_GROUP_SIZE (stmt_info) = groupsize;
2507 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2508 if (dump_enabled_p ())
2509 {
2510 dump_printf_loc (MSG_NOTE, vect_location,
2511 "Detected single element interleaving ");
2512 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2513 dump_printf (MSG_NOTE, " step ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2515 dump_printf (MSG_NOTE, "\n");
2516 }
2517
2518 return true;
2519 }
2520
2521 if (dump_enabled_p ())
2522 {
2523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2524 "not consecutive access ");
2525 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2526 stmt_info->stmt, 0);
2527 }
2528
2529 if (bb_vinfo)
2530 {
2531 /* Mark the statement as unvectorizable. */
2532 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2533 return true;
2534 }
2535
2536 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2537 STMT_VINFO_STRIDED_P (stmt_info) = true;
2538 return true;
2539 }
2540
2541 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2542 {
2543 /* First stmt in the interleaving chain. Check the chain. */
2544 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2545 struct data_reference *data_ref = dr;
2546 unsigned int count = 1;
2547 tree prev_init = DR_INIT (data_ref);
2548 stmt_vec_info prev = stmt_info;
2549 HOST_WIDE_INT diff, gaps = 0;
2550
2551 /* By construction, all group members have INTEGER_CST DR_INITs. */
2552 while (next)
2553 {
2554 /* Skip same data-refs. If two or more stmts share the same
2555 data-ref (supported only for loads), we vectorize only the first
2556 stmt, and the rest get their vectorized loads from the first
2557 one. */
2558 if (!tree_int_cst_compare (DR_INIT (data_ref),
2559 DR_INIT (STMT_VINFO_DATA_REF (next))))
2560 {
2561 if (DR_IS_WRITE (data_ref))
2562 {
2563 if (dump_enabled_p ())
2564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2565 "Two store stmts share the same dr.\n");
2566 return false;
2567 }
2568
2569 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571 "Two or more load stmts share the same dr.\n");
2572
2573 /* For load use the same data-ref load. */
2574 DR_GROUP_SAME_DR_STMT (next) = prev;
2575
2576 prev = next;
2577 next = DR_GROUP_NEXT_ELEMENT (next);
2578 continue;
2579 }
2580
2581 prev = next;
2582 data_ref = STMT_VINFO_DATA_REF (next);
2583
2584 /* All group members have the same STEP by construction. */
2585 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2586
2587 /* Check that the distance between two accesses is equal to the type
2588 size. Otherwise, we have gaps. */
2589 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2590 - TREE_INT_CST_LOW (prev_init)) / type_size;
2591 if (diff != 1)
2592 {
2593 /* FORNOW: SLP of accesses with gaps is not supported. */
2594 slp_impossible = true;
2595 if (DR_IS_WRITE (data_ref))
2596 {
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2599 "interleaved store with gaps\n");
2600 return false;
2601 }
2602
2603 gaps += diff - 1;
2604 }
2605
2606 last_accessed_element += diff;
2607
2608 /* Store the gap from the previous member of the group. If there is no
2609 gap in the access, DR_GROUP_GAP is always 1. */
2610 DR_GROUP_GAP (next) = diff;
2611
2612 prev_init = DR_INIT (data_ref);
2613 next = DR_GROUP_NEXT_ELEMENT (next);
2614 /* Count the number of data-refs in the chain. */
2615 count++;
2616 }
2617
2618 if (groupsize == 0)
2619 groupsize = count + gaps;
2620
2621 /* This could be UINT_MAX but as we are generating code in a very
2622 inefficient way we have to cap earlier. See PR78699 for example. */
2623 if (groupsize > 4096)
2624 {
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627 "group is too large\n");
2628 return false;
2629 }
2630
2631 /* Check that the size of the interleaving is equal to count for stores,
2632 i.e., that there are no gaps. */
2633 if (groupsize != count
2634 && !DR_IS_READ (dr))
2635 {
2636 groupsize = count;
2637 STMT_VINFO_STRIDED_P (stmt_info) = true;
2638 }
2639
2640 /* If there is a gap after the last load in the group it is the
2641 difference between the groupsize and the last accessed
2642 element.
2643 When there is no gap, this difference should be 0. */
2644 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2645
2646 DR_GROUP_SIZE (stmt_info) = groupsize;
2647 if (dump_enabled_p ())
2648 {
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Detected interleaving ");
2651 if (DR_IS_READ (dr))
2652 dump_printf (MSG_NOTE, "load ");
2653 else if (STMT_VINFO_STRIDED_P (stmt_info))
2654 dump_printf (MSG_NOTE, "strided store ");
2655 else
2656 dump_printf (MSG_NOTE, "store ");
2657 dump_printf (MSG_NOTE, "of size %u starting with ",
2658 (unsigned)groupsize);
2659 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2660 if (DR_GROUP_GAP (stmt_info) != 0)
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "There is a gap of %u elements after the group\n",
2663 DR_GROUP_GAP (stmt_info));
2664 }
2665
2666 /* SLP: create an SLP data structure for every interleaving group of
2667 stores for further analysis in vect_analyze_slp. */
2668 if (DR_IS_WRITE (dr) && !slp_impossible)
2669 {
2670 if (loop_vinfo)
2671 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2672 if (bb_vinfo)
2673 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2674 }
2675 }
2676
2677 return true;
2678 }
2679
2680 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2681 accesses of legal size, step, etc. Detect gaps, single element
2682 interleaving, and other special cases. Set grouped access info.
2683 Collect groups of strided stores for further use in SLP analysis. */
2684
2685 static bool
2686 vect_analyze_group_access (dr_vec_info *dr_info)
2687 {
2688 if (!vect_analyze_group_access_1 (dr_info))
2689 {
2690 /* Dissolve the group if present. */
2691 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2692 while (stmt_info)
2693 {
2694 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2695 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2696 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2697 stmt_info = next;
2698 }
2699 return false;
2700 }
2701 return true;
2702 }
2703
2704 /* Analyze the access pattern of the data-reference DR_INFO.
2705 In case of non-consecutive accesses call vect_analyze_group_access() to
2706 analyze groups of accesses. */
2707
2708 static bool
2709 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2710 {
2711 data_reference *dr = dr_info->dr;
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 stmt_vec_info stmt_info = dr_info->stmt;
2715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2716 struct loop *loop = NULL;
2717
2718 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2719 return true;
2720
2721 if (loop_vinfo)
2722 loop = LOOP_VINFO_LOOP (loop_vinfo);
2723
2724 if (loop_vinfo && !step)
2725 {
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2728 "bad data-ref access in loop\n");
2729 return false;
2730 }
2731
2732 /* Allow loads with zero step in inner-loop vectorization. */
2733 if (loop_vinfo && integer_zerop (step))
2734 {
2735 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2736 if (!nested_in_vect_loop_p (loop, stmt_info))
2737 return DR_IS_READ (dr);
2738 /* Allow references with zero step for outer loops marked
2739 with pragma omp simd only - it guarantees absence of
2740 loop-carried dependencies between inner loop iterations. */
2741 if (loop->safelen < 2)
2742 {
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "zero step in inner loop of nest\n");
2746 return false;
2747 }
2748 }
2749
2750 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2751 {
2752 /* Interleaved accesses are not yet supported within outer-loop
2753 vectorization for references in the inner-loop. */
2754 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2755
2756 /* For the rest of the analysis we use the outer-loop step. */
2757 step = STMT_VINFO_DR_STEP (stmt_info);
2758 if (integer_zerop (step))
2759 {
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "zero step in outer loop.\n");
2763 return DR_IS_READ (dr);
2764 }
2765 }
2766
2767 /* Consecutive? */
2768 if (TREE_CODE (step) == INTEGER_CST)
2769 {
2770 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2771 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2772 || (dr_step < 0
2773 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2774 {
2775 /* Mark that it is not interleaving. */
2776 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2777 return true;
2778 }
2779 }
2780
2781 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2782 {
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "grouped access in outer loop.\n");
2786 return false;
2787 }
2788
2789
2790 /* Assume this is a DR handled by the non-constant strided load case. */
2791 if (TREE_CODE (step) != INTEGER_CST)
2792 return (STMT_VINFO_STRIDED_P (stmt_info)
2793 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2794 || vect_analyze_group_access (dr_info)));
2795
2796 /* Not a consecutive access - check whether it is part of an interleaving group. */
2797 return vect_analyze_group_access (dr_info);
2798 }
2799
2800 /* Compare two data-references DRA and DRB in order to sort them into
2801 chunks suitable for grouping. */
2802
2803 static int
2804 dr_group_sort_cmp (const void *dra_, const void *drb_)
2805 {
2806 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2807 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2808 int cmp;
2809
2810 /* Stabilize sort. */
2811 if (dra == drb)
2812 return 0;
2813
2814 /* DRs in different loops never belong to the same group. */
2815 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2816 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2817 if (loopa != loopb)
2818 return loopa->num < loopb->num ? -1 : 1;
2819
2820 /* Ordering of DRs according to base. */
2821 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2822 DR_BASE_ADDRESS (drb));
2823 if (cmp != 0)
2824 return cmp;
2825
2826 /* And according to DR_OFFSET. */
2827 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2828 if (cmp != 0)
2829 return cmp;
2830
2831 /* Put reads before writes. */
2832 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2833 return DR_IS_READ (dra) ? -1 : 1;
2834
2835 /* Then sort by access size. */
2836 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2837 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2838 if (cmp != 0)
2839 return cmp;
2840
2841 /* And by step. */
2842 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2843 if (cmp != 0)
2844 return cmp;
2845
2846 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2847 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2848 if (cmp == 0)
2849 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2850 return cmp;
2851 }
2852
2853 /* If OP is the result of a conversion, return the unconverted value,
2854 otherwise return null. */
2855
2856 static tree
2857 strip_conversion (tree op)
2858 {
2859 if (TREE_CODE (op) != SSA_NAME)
2860 return NULL_TREE;
2861 gimple *stmt = SSA_NAME_DEF_STMT (op);
2862 if (!is_gimple_assign (stmt)
2863 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2864 return NULL_TREE;
2865 return gimple_assign_rhs1 (stmt);
2866 }
2867
2868 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2869 and STMT2_INFO being in a single group. */
2870
2871 static bool
2872 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2873 {
2874 if (gimple_assign_single_p (stmt1_info->stmt))
2875 return gimple_assign_single_p (stmt2_info->stmt);
2876
2877 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2878 if (call1 && gimple_call_internal_p (call1))
2879 {
2880 /* Check for two masked loads or two masked stores. */
2881 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2882 if (!call2 || !gimple_call_internal_p (call2))
2883 return false;
2884 internal_fn ifn = gimple_call_internal_fn (call1);
2885 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2886 return false;
2887 if (ifn != gimple_call_internal_fn (call2))
2888 return false;
2889
2890 /* Check that the masks are the same. Cope with casts of masks,
2891 like those created by build_mask_conversion. */
2892 tree mask1 = gimple_call_arg (call1, 2);
2893 tree mask2 = gimple_call_arg (call2, 2);
2894 if (!operand_equal_p (mask1, mask2, 0))
2895 {
2896 mask1 = strip_conversion (mask1);
2897 if (!mask1)
2898 return false;
2899 mask2 = strip_conversion (mask2);
2900 if (!mask2)
2901 return false;
2902 if (!operand_equal_p (mask1, mask2, 0))
2903 return false;
2904 }
2905 return true;
2906 }
2907
2908 return false;
2909 }
2910
2911 /* Function vect_analyze_data_ref_accesses.
2912
2913 Analyze the access pattern of all the data references in the loop.
2914
2915 FORNOW: the only access pattern that is considered vectorizable is a
2916 simple step 1 (consecutive) access.
2917
2918 FORNOW: handle only arrays and pointer accesses. */
2919
2920 bool
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 {
2923 unsigned int i;
2924 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925 struct data_reference *dr;
2926
2927 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928
2929 if (datarefs.is_empty ())
2930 return true;
2931
2932 /* Sort the array of datarefs to make building the interleaving chains
2933 linear. Don't modify the original vector's order; it is needed for
2934 determining what dependencies are reversed. */
2935 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936 datarefs_copy.qsort (dr_group_sort_cmp);
2937
2938 /* Build the interleaving chains. */
2939 for (i = 0; i < datarefs_copy.length () - 1;)
2940 {
2941 data_reference_p dra = datarefs_copy[i];
2942 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2943 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2944 stmt_vec_info lastinfo = NULL;
2945 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2946 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2947 {
2948 ++i;
2949 continue;
2950 }
2951 for (i = i + 1; i < datarefs_copy.length (); ++i)
2952 {
2953 data_reference_p drb = datarefs_copy[i];
2954 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2955 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2956 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2957 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2958 break;
2959
2960 /* ??? Imperfect sorting (non-compatible types, non-modulo
2961 accesses, same accesses) can lead to a group being artificially
2962 split here as we don't just skip over those. If it really
2963 matters we can push those to a worklist and re-iterate
2964 over them. Then we can just skip ahead to the next DR here. */
2965
2966 /* DRs in different loops should not be put into the same
2967 interleaving group. */
2968 if (gimple_bb (DR_STMT (dra))->loop_father
2969 != gimple_bb (DR_STMT (drb))->loop_father)
2970 break;
2971
2972 /* Check that the data-refs have the same first location (except init)
2973 and that they are both either stores or loads (not a load and a
2974 store, and with matching masks for masked accesses). */
2975 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2976 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2977 DR_BASE_ADDRESS (drb)) != 0
2978 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2979 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2980 break;
2981
2982 /* Check that the data-refs have the same constant size. */
2983 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2984 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2985 if (!tree_fits_uhwi_p (sza)
2986 || !tree_fits_uhwi_p (szb)
2987 || !tree_int_cst_equal (sza, szb))
2988 break;
2989
2990 /* Check that the data-refs have the same step. */
2991 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2992 break;
2993
2994 /* Check the types are compatible.
2995 ??? We don't distinguish this during sorting. */
2996 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2997 TREE_TYPE (DR_REF (drb))))
2998 break;
2999
3000 /* Check that the DR_INITs are compile-time constants. */
3001 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3002 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3003 break;
3004
3005 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3006 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3007 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3008 HOST_WIDE_INT init_prev
3009 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3010 gcc_assert (init_a <= init_b
3011 && init_a <= init_prev
3012 && init_prev <= init_b);
3013
3014 /* Do not place the same access in the interleaving chain twice. */
3015 if (init_b == init_prev)
3016 {
3017 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3018 < gimple_uid (DR_STMT (drb)));
3019 /* ??? For now we simply "drop" the later reference which is
3020 otherwise the same rather than finishing off this group.
3021 In the end we'd want to re-process duplicates forming
3022 multiple groups from the refs, likely by just collecting
3023 all candidates (including duplicates and split points
3024 below) in a vector and then processing them together. */
3025 continue;
3026 }
3027
3028 /* If init_b == init_a + the size of the type * k, we have an
3029 interleaving, and DRA is accessed before DRB. */
3030 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3031 if (type_size_a == 0
3032 || (init_b - init_a) % type_size_a != 0)
3033 break;
3034
3035 /* If we have a store, the accesses must be adjacent. This splits
3036 groups into chunks we support (we don't support vectorization
3037 of stores with gaps). */
3038 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3039 break;
3040
3041 /* If the step (when constant and nonzero) is greater than the
3042 difference between the data-refs' inits, this splits groups into
3043 suitable sizes. */
3044 if (tree_fits_shwi_p (DR_STEP (dra)))
3045 {
3046 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3047 if (step != 0 && step <= (init_b - init_a))
3048 break;
3049 }
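/* Illustrative example: stores to a[2*i] and a[2*i+1], with a an array
   of 4-byte ints, share base, offset and step (8 bytes); their DR_INITs
   are 0 and 4, the difference is a multiple of the 4-byte element size,
   the accesses are adjacent, and the step (8) is greater than the
   difference (4), so the two statements are linked into one
   interleaving group below.  */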
3050
3051 if (dump_enabled_p ())
3052 {
3053 dump_printf_loc (MSG_NOTE, vect_location,
3054 "Detected interleaving ");
3055 if (DR_IS_READ (dra))
3056 dump_printf (MSG_NOTE, "load ");
3057 else
3058 dump_printf (MSG_NOTE, "store ");
3059 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3060 dump_printf (MSG_NOTE, " and ");
3061 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3062 dump_printf (MSG_NOTE, "\n");
3063 }
3064
3065 /* Link the found element into the group list. */
3066 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3067 {
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3069 lastinfo = stmtinfo_a;
3070 }
3071 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3072 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3073 lastinfo = stmtinfo_b;
3074 }
3075 }
3076
3077 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3078 {
3079 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3080 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3081 && !vect_analyze_data_ref_access (dr_info))
3082 {
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3085 "not vectorized: complicated access pattern.\n");
3086
3087 if (is_a <bb_vec_info> (vinfo))
3088 {
3089 /* Mark the statement as not vectorizable. */
3090 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3091 continue;
3092 }
3093 else
3094 {
3095 datarefs_copy.release ();
3096 return false;
3097 }
3098 }
3099 }
3100
3101 datarefs_copy.release ();
3102 return true;
3103 }
3104
3105 /* Function vect_vfa_segment_size.
3106
3107 Input:
3108 DR_INFO: The data reference.
3109 LENGTH_FACTOR: segment length to consider.
3110
3111 Return a value suitable for the dr_with_seg_len::seg_len field.
3112 This is the "distance travelled" by the pointer from the first
3113 iteration in the segment to the last. Note that it does not include
3114 the size of the access; in effect it only describes the first byte. */
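/* For example (illustrative numbers): a contiguous access to 4-byte
   elements has DR_STEP 4; with LENGTH_FACTOR 8 the segment length is
   (8 - 1) * 4 = 28 bytes, the distance from the start of the first
   access in the segment to the start of the last.  */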
3115
3116 static tree
3117 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3118 {
3119 length_factor = size_binop (MINUS_EXPR,
3120 fold_convert (sizetype, length_factor),
3121 size_one_node);
3122 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3123 length_factor);
3124 }
3125
3126 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3127 gives the worst-case number of bytes covered by the segment. */
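/* For instance (illustrative), for the first statement of a group of
   three 4-byte accesses with no gap, the access size is 3 * 4 = 12
   bytes; adding that to the 28-byte segment length of the example
   above gives a 40-byte range to be checked for overlap.  */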
3128
3129 static unsigned HOST_WIDE_INT
3130 vect_vfa_access_size (dr_vec_info *dr_info)
3131 {
3132 stmt_vec_info stmt_vinfo = dr_info->stmt;
3133 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3134 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3135 unsigned HOST_WIDE_INT access_size = ref_size;
3136 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3137 {
3138 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3139 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3140 }
3141 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3142 && (vect_supportable_dr_alignment (dr_info, false)
3143 == dr_explicit_realign_optimized))
3144 {
3145 /* We might access a full vector's worth. */
3146 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3147 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3148 }
3149 return access_size;
3150 }
3151
3152 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3153 describes. */
3154
3155 static unsigned int
3156 vect_vfa_align (dr_vec_info *dr_info)
3157 {
3158 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3159 }
3160
3161 /* Function vect_no_alias_p.
3162
3163 Given data references A and B with equal base and offset, see whether
3164 the alias relation can be decided at compilation time. Return 1 if
3165 it can and the references alias, 0 if it can and the references do
3166 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3167 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3168 of dr_with_seg_len::{seg_len,access_size} for A and B. */
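/* Illustrative example with positive steps: if A and B have DR_INITs
   0 and 32, segment lengths of 16 bytes and access sizes of 4 bytes,
   the byte ranges checked are [0, 20) and [32, 52), which cannot
   overlap, so the function returns 0; with DR_INITs 0 and 8 the
   ranges [0, 20) and [8, 28) always overlap and it returns 1.  */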
3169
3170 static int
3171 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3172 tree segment_length_a, tree segment_length_b,
3173 unsigned HOST_WIDE_INT access_size_a,
3174 unsigned HOST_WIDE_INT access_size_b)
3175 {
3176 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3177 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3178 poly_uint64 const_length_a;
3179 poly_uint64 const_length_b;
3180
3181 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3182 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3183 [a, a+12) */
3184 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3185 {
3186 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3187 offset_a = (offset_a + access_size_a) - const_length_a;
3188 }
3189 else
3190 const_length_a = tree_to_poly_uint64 (segment_length_a);
3191 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3192 {
3193 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3194 offset_b = (offset_b + access_size_b) - const_length_b;
3195 }
3196 else
3197 const_length_b = tree_to_poly_uint64 (segment_length_b);
3198
3199 const_length_a += access_size_a;
3200 const_length_b += access_size_b;
3201
3202 if (ranges_known_overlap_p (offset_a, const_length_a,
3203 offset_b, const_length_b))
3204 return 1;
3205
3206 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3207 offset_b, const_length_b))
3208 return 0;
3209
3210 return -1;
3211 }
3212
3213 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3214 in DDR is >= VF. */
3215
3216 static bool
3217 dependence_distance_ge_vf (data_dependence_relation *ddr,
3218 unsigned int loop_depth, poly_uint64 vf)
3219 {
3220 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3221 || DDR_NUM_DIST_VECTS (ddr) == 0)
3222 return false;
3223
3224 /* If the dependence is exact, we should have limited the VF instead. */
3225 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3226
3227 unsigned int i;
3228 lambda_vector dist_v;
3229 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3230 {
3231 HOST_WIDE_INT dist = dist_v[loop_depth];
3232 if (dist != 0
3233 && !(dist > 0 && DDR_REVERSED_P (ddr))
3234 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3235 return false;
3236 }
3237
3238 if (dump_enabled_p ())
3239 {
3240 dump_printf_loc (MSG_NOTE, vect_location,
3241 "dependence distance between ");
3242 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3243 dump_printf (MSG_NOTE, " and ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3245 dump_printf (MSG_NOTE, " is >= VF\n");
3246 }
3247
3248 return true;
3249 }
3250
3251 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3252
3253 static void
3254 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3255 {
3256 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3257 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3258 dump_printf (dump_kind, ") >= ");
3259 dump_dec (dump_kind, lower_bound.min_value);
3260 }
3261
3262 /* Record that the vectorized loop requires the vec_lower_bound described
3263 by EXPR, UNSIGNED_P and MIN_VALUE. */
3264
3265 static void
3266 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3267 poly_uint64 min_value)
3268 {
3269 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3270 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3271 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3272 {
3273 unsigned_p &= lower_bounds[i].unsigned_p;
3274 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3275 if (lower_bounds[i].unsigned_p != unsigned_p
3276 || maybe_lt (lower_bounds[i].min_value, min_value))
3277 {
3278 lower_bounds[i].unsigned_p = unsigned_p;
3279 lower_bounds[i].min_value = min_value;
3280 if (dump_enabled_p ())
3281 {
3282 dump_printf_loc (MSG_NOTE, vect_location,
3283 "updating run-time check to ");
3284 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3285 dump_printf (MSG_NOTE, "\n");
3286 }
3287 }
3288 return;
3289 }
3290
3291 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3292 if (dump_enabled_p ())
3293 {
3294 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3295 dump_lower_bound (MSG_NOTE, lower_bound);
3296 dump_printf (MSG_NOTE, "\n");
3297 }
3298 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3299 }
3300
3301 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3302 will span fewer than GAP bytes. */
3303
3304 static bool
3305 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3306 poly_int64 gap)
3307 {
3308 stmt_vec_info stmt_info = dr_info->stmt;
3309 HOST_WIDE_INT count
3310 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3311 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3312 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3313 return (estimated_poly_value (gap)
3314 <= count * vect_get_scalar_dr_size (dr_info));
3315 }
3316
3317 /* Return true if we know that there is no alias between DR_INFO_A and
3318 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3319 When returning true, set *LOWER_BOUND_OUT to this N. */
3320
3321 static bool
3322 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3323 poly_uint64 *lower_bound_out)
3324 {
3325 /* Check that there is a constant gap of known sign between DR_A
3326 and DR_B. */
3327 data_reference *dr_a = dr_info_a->dr;
3328 data_reference *dr_b = dr_info_b->dr;
3329 poly_int64 init_a, init_b;
3330 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3331 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3332 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3333 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3334 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3335 || !ordered_p (init_a, init_b))
3336 return false;
3337
3338 /* Sort DR_A and DR_B by the address they access. */
3339 if (maybe_lt (init_b, init_a))
3340 {
3341 std::swap (init_a, init_b);
3342 std::swap (dr_info_a, dr_info_b);
3343 std::swap (dr_a, dr_b);
3344 }
3345
3346 /* If the two accesses could be dependent within a scalar iteration,
3347 make sure that we'd retain their order. */
3348 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3349 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3350 return false;
3351
3352 /* There is no alias if abs (DR_STEP) is greater than or equal to
3353 the bytes spanned by the combination of the two accesses. */
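  /* Worked example (hypothetical values): if DR_A starts at byte offset 0,
     DR_B starts at byte offset 4 and DR_B accesses 4 bytes, the bound below
     is 4 + 4 - 0 = 8, so any |DR_STEP| >= 8 keeps the iterations apart.  */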
3354 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3355 return true;
3356 }
3357
3358 /* Function vect_prune_runtime_alias_test_list.
3359
3360 Prune a list of ddrs to be tested at run-time by versioning for alias.
3361 Merge several alias checks into one if possible.
3362 Return FALSE if the resulting list of ddrs is longer than allowed by
3363 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3364
3365 bool
3366 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3367 {
3368 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3369 hash_set <tree_pair_hash> compared_objects;
3370
3371 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3372 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3373 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3374 vec<vec_object_pair> &check_unequal_addrs
3375 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3376 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3377 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3378
3379 ddr_p ddr;
3380 unsigned int i;
3381 tree length_factor;
3382
3383 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3384
3385 /* Step values are irrelevant for aliasing if the number of vector
3386 iterations is equal to the number of scalar iterations (which can
3387 happen for fully-SLP loops). */
3388 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3389
3390 if (!ignore_step_p)
3391 {
3392 /* Convert the checks for nonzero steps into bound tests. */
3393 tree value;
3394 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3395 vect_check_lower_bound (loop_vinfo, value, true, 1);
3396 }
3397
3398 if (may_alias_ddrs.is_empty ())
3399 return true;
3400
3401 comp_alias_ddrs.create (may_alias_ddrs.length ());
3402
3403 unsigned int loop_depth
3404 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3405 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3406
3407 /* First, we collect all data ref pairs for aliasing checks. */
3408 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3409 {
3410 int comp_res;
3411 poly_uint64 lower_bound;
3412 tree segment_length_a, segment_length_b;
3413 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3414 unsigned int align_a, align_b;
3415
3416 /* Ignore the alias if the VF we chose ended up being no greater
3417 than the dependence distance. */
3418 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3419 continue;
3420
3421 if (DDR_OBJECT_A (ddr))
3422 {
3423 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3424 if (!compared_objects.add (new_pair))
3425 {
3426 if (dump_enabled_p ())
3427 {
3428 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3429 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3430 dump_printf (MSG_NOTE, " and ");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3432 dump_printf (MSG_NOTE, " have different addresses\n");
3433 }
3434 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3435 }
3436 continue;
3437 }
3438
3439 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3440 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3441
3442 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3443 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3444
3445 /* Skip the pair if inter-iteration dependencies are irrelevant
3446 and intra-iteration dependencies are guaranteed to be honored. */
3447 if (ignore_step_p
3448 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3449 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3450 &lower_bound)))
3451 {
3452 if (dump_enabled_p ())
3453 {
3454 dump_printf_loc (MSG_NOTE, vect_location,
3455 "no need for alias check between ");
3456 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3457 dump_printf (MSG_NOTE, " and ");
3458 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3459 dump_printf (MSG_NOTE, " when VF is 1\n");
3460 }
3461 continue;
3462 }
3463
3464 /* See whether we can handle the alias using a bounds check on
3465 the step, and whether that's likely to be the best approach.
3466 (It might not be, for example, if the minimum step is much larger
3467 than the number of bytes handled by one vector iteration.) */
3468 if (!ignore_step_p
3469 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3470 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3471 &lower_bound)
3472 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3473 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3474 {
3475 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3476 if (dump_enabled_p ())
3477 {
3478 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3480 dump_printf (MSG_NOTE, " and ");
3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3482 dump_printf (MSG_NOTE, " when the step ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_info_a->dr));
3484 dump_printf (MSG_NOTE, " is outside ");
3485 if (unsigned_p)
3486 dump_printf (MSG_NOTE, "[0");
3487 else
3488 {
3489 dump_printf (MSG_NOTE, "(");
3490 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3491 }
3492 dump_printf (MSG_NOTE, ", ");
3493 dump_dec (MSG_NOTE, lower_bound);
3494 dump_printf (MSG_NOTE, ")\n");
3495 }
3496 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3497 unsigned_p, lower_bound);
3498 continue;
3499 }
3500
3501 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3502 if (dr_group_first_a)
3503 {
3504 stmt_info_a = dr_group_first_a;
3505 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3506 }
3507
3508 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3509 if (dr_group_first_b)
3510 {
3511 stmt_info_b = dr_group_first_b;
3512 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3513 }
3514
3515 if (ignore_step_p)
3516 {
3517 segment_length_a = size_zero_node;
3518 segment_length_b = size_zero_node;
3519 }
3520 else
3521 {
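	  /* A note on the choice below: when the two references advance by
	     the same step their relative distance is fixed, so the segments
	     only need to cover one vector iteration (VF scalar iterations);
	     when the steps differ the references drift, so the segments must
	     cover the whole scalar iteration count.  */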
3522 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3523 DR_STEP (dr_info_b->dr), 0))
3524 length_factor = scalar_loop_iters;
3525 else
3526 length_factor = size_int (vect_factor);
3527 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3528 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3529 }
3530 access_size_a = vect_vfa_access_size (dr_info_a);
3531 access_size_b = vect_vfa_access_size (dr_info_b);
3532 align_a = vect_vfa_align (dr_info_a);
3533 align_b = vect_vfa_align (dr_info_b);
3534
3535 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3536 DR_BASE_ADDRESS (dr_info_b->dr));
3537 if (comp_res == 0)
3538 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3539 DR_OFFSET (dr_info_b->dr));
3540
3541 /* See whether the alias is known at compilation time. */
3542 if (comp_res == 0
3543 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3544 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3545 && poly_int_tree_p (segment_length_a)
3546 && poly_int_tree_p (segment_length_b))
3547 {
3548 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3549 segment_length_a,
3550 segment_length_b,
3551 access_size_a,
3552 access_size_b);
3553 if (res >= 0 && dump_enabled_p ())
3554 {
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "can tell at compile time that ");
3557 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3558 dump_printf (MSG_NOTE, " and ");
3559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3560 if (res == 0)
3561 dump_printf (MSG_NOTE, " do not alias\n");
3562 else
3563 dump_printf (MSG_NOTE, " alias\n");
3564 }
3565
3566 if (res == 0)
3567 continue;
3568
3569 if (res == 1)
3570 {
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_NOTE, vect_location,
3573 "not vectorized: compilation time alias.\n");
3574 return false;
3575 }
3576 }
3577
3578 dr_with_seg_len_pair_t dr_with_seg_len_pair
3579 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3580 access_size_a, align_a),
3581 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3582 access_size_b, align_b));
3583
3584 /* Canonicalize pairs by sorting the two DR members. */
3585 if (comp_res > 0)
3586 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3587
3588 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3589 }
3590
3591 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3592
3593 unsigned int count = (comp_alias_ddrs.length ()
3594 + check_unequal_addrs.length ());
3595
3596 dump_printf_loc (MSG_NOTE, vect_location,
3597 "improved number of alias checks from %d to %d\n",
3598 may_alias_ddrs.length (), count);
3599 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3600 {
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "number of versioning for alias "
3604 "run-time tests exceeds %d "
3605 "(--param vect-max-version-for-alias-checks)\n",
3606 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3607 return false;
3608 }
3609
3610 return true;
3611 }
3612
3613 /* Check whether we can use an internal function for a gather load
3614 or scatter store. READ_P is true for loads and false for stores.
3615 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3616 the type of the memory elements being loaded or stored. OFFSET_BITS
3617 is the number of bits in each scalar offset and OFFSET_SIGN is the
3618 sign of the offset. SCALE is the amount by which the offset should
3619 be multiplied *after* it has been converted to address width.
3620
3621 Return true if the function is supported, storing the function
3622 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
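/* For instance (illustrative only): a masked gather of 32-bit elements with
   32-bit signed offsets and SCALE 4 queries the target for
   IFN_MASK_GATHER_LOAD support on VECTYPE with that offset sign and scale.  */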
3623
3624 bool
3625 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3626 tree memory_type, unsigned int offset_bits,
3627 signop offset_sign, int scale,
3628 internal_fn *ifn_out, tree *element_type_out)
3629 {
3630 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3631 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3632 if (offset_bits > element_bits)
3633 /* Internal functions require the offset to be the same width as
3634 the vector elements. We can extend narrower offsets, but it isn't
3635 safe to truncate wider offsets. */
3636 return false;
3637
3638 if (element_bits != memory_bits)
3639 /* For now the vector elements must be the same width as the
3640 memory elements. */
3641 return false;
3642
3643 /* Work out which function we need. */
3644 internal_fn ifn;
3645 if (read_p)
3646 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3647 else
3648 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3649
3650 /* Test whether the target supports this combination. */
3651 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3652 offset_sign, scale))
3653 return false;
3654
3655 *ifn_out = ifn;
3656 *element_type_out = TREE_TYPE (vectype);
3657 return true;
3658 }
3659
3660 /* STMT_INFO is a call to an internal gather load or scatter store function.
3661 Describe the operation in INFO. */
3662
3663 static void
3664 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3665 gather_scatter_info *info)
3666 {
3667 gcall *call = as_a <gcall *> (stmt_info->stmt);
3668 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3669 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3670
3671 info->ifn = gimple_call_internal_fn (call);
3672 info->decl = NULL_TREE;
3673 info->base = gimple_call_arg (call, 0);
3674 info->offset = gimple_call_arg (call, 1);
3675 info->offset_dt = vect_unknown_def_type;
3676 info->offset_vectype = NULL_TREE;
3677 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3678 info->element_type = TREE_TYPE (vectype);
3679 info->memory_type = TREE_TYPE (DR_REF (dr));
3680 }
3681
3682 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3683 gather load or scatter store. Describe the operation in *INFO if so. */
3684
3685 bool
3686 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3687 gather_scatter_info *info)
3688 {
3689 HOST_WIDE_INT scale = 1;
3690 poly_int64 pbitpos, pbitsize;
3691 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3692 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3693 tree offtype = NULL_TREE;
3694 tree decl = NULL_TREE, base, off;
3695 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3696 tree memory_type = TREE_TYPE (DR_REF (dr));
3697 machine_mode pmode;
3698 int punsignedp, reversep, pvolatilep = 0;
3699 internal_fn ifn;
3700 tree element_type;
3701 bool masked_p = false;
3702
3703 /* See whether this is already a call to a gather/scatter internal function.
3704 If not, see whether it's a masked load or store. */
3705 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3706 if (call && gimple_call_internal_p (call))
3707 {
3708 ifn = gimple_call_internal_fn (call);
3709 if (internal_gather_scatter_fn_p (ifn))
3710 {
3711 vect_describe_gather_scatter_call (stmt_info, info);
3712 return true;
3713 }
3714 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3715 }
3716
3717 /* True if we should aim to use internal functions rather than
3718 built-in functions. */
3719 bool use_ifn_p = (DR_IS_READ (dr)
3720 ? supports_vec_gather_load_p ()
3721 : supports_vec_scatter_store_p ());
3722
3723 base = DR_REF (dr);
3724 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3725 see if we can use the def stmt of the address. */
3726 if (masked_p
3727 && TREE_CODE (base) == MEM_REF
3728 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3729 && integer_zerop (TREE_OPERAND (base, 1))
3730 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3731 {
3732 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3733 if (is_gimple_assign (def_stmt)
3734 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3735 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3736 }
3737
3738 /* The gather and scatter builtins need address of the form
3739 loop_invariant + vector * {1, 2, 4, 8}
3740 or
3741 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3742 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3743 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3744 multiplications and additions in it. To get a vector, we need
3745 a single SSA_NAME that will be defined in the loop and will
3746 contain everything that is not loop invariant and that can be
3747 vectorized. The following code attempts to find such a preexisting
3748 SSA_NAME OFF and put the loop invariants into a tree BASE
3749 that can be gimplified before the loop. */
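  /* As a made-up illustration, a scalar loop such as

	for (i = 0; i < n; i++)
	  sum += data[idx[i]];

     references data[idx[i]]: the loop-invariant &data[0] ends up in BASE,
     the SSA_NAME holding idx[i] becomes OFF and the element size becomes
     the scale.  */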
3750 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3751 &punsignedp, &reversep, &pvolatilep);
3752 if (reversep)
3753 return false;
3754
3755 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3756
3757 if (TREE_CODE (base) == MEM_REF)
3758 {
3759 if (!integer_zerop (TREE_OPERAND (base, 1)))
3760 {
3761 if (off == NULL_TREE)
3762 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3763 else
3764 off = size_binop (PLUS_EXPR, off,
3765 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3766 }
3767 base = TREE_OPERAND (base, 0);
3768 }
3769 else
3770 base = build_fold_addr_expr (base);
3771
3772 if (off == NULL_TREE)
3773 off = size_zero_node;
3774
3775 /* If base is not loop invariant, then: if off is 0, start with just the
3776 constant offset in the loop invariant BASE and continue with base as
3777 OFF; otherwise give up. We could handle that case by gimplifying the
3778 addition of base + off into some SSA_NAME and using that as off, but
3779 for now punt. */
3780 if (!expr_invariant_in_loop_p (loop, base))
3781 {
3782 if (!integer_zerop (off))
3783 return false;
3784 off = base;
3785 base = size_int (pbytepos);
3786 }
3787 /* Otherwise put base + constant offset into the loop invariant BASE
3788 and continue with OFF. */
3789 else
3790 {
3791 base = fold_convert (sizetype, base);
3792 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3793 }
3794
3795 /* OFF at this point may be either a SSA_NAME or some tree expression
3796 from get_inner_reference. Try to peel off loop invariants from it
3797 into BASE as long as possible. */
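  /* For instance (hypothetical tree): if OFF is (sizetype) i_5 * 4 + 16,
     the invariant 16 is folded into BASE, the multiplication sets SCALE
     to 4 and i_5 is what remains as the vectorizable offset.  */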
3798 STRIP_NOPS (off);
3799 while (offtype == NULL_TREE)
3800 {
3801 enum tree_code code;
3802 tree op0, op1, add = NULL_TREE;
3803
3804 if (TREE_CODE (off) == SSA_NAME)
3805 {
3806 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3807
3808 if (expr_invariant_in_loop_p (loop, off))
3809 return false;
3810
3811 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3812 break;
3813
3814 op0 = gimple_assign_rhs1 (def_stmt);
3815 code = gimple_assign_rhs_code (def_stmt);
3816 op1 = gimple_assign_rhs2 (def_stmt);
3817 }
3818 else
3819 {
3820 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3821 return false;
3822 code = TREE_CODE (off);
3823 extract_ops_from_tree (off, &code, &op0, &op1);
3824 }
3825 switch (code)
3826 {
3827 case POINTER_PLUS_EXPR:
3828 case PLUS_EXPR:
3829 if (expr_invariant_in_loop_p (loop, op0))
3830 {
3831 add = op0;
3832 off = op1;
3833 do_add:
3834 add = fold_convert (sizetype, add);
3835 if (scale != 1)
3836 add = size_binop (MULT_EXPR, add, size_int (scale));
3837 base = size_binop (PLUS_EXPR, base, add);
3838 continue;
3839 }
3840 if (expr_invariant_in_loop_p (loop, op1))
3841 {
3842 add = op1;
3843 off = op0;
3844 goto do_add;
3845 }
3846 break;
3847 case MINUS_EXPR:
3848 if (expr_invariant_in_loop_p (loop, op1))
3849 {
3850 add = fold_convert (sizetype, op1);
3851 add = size_binop (MINUS_EXPR, size_zero_node, add);
3852 off = op0;
3853 goto do_add;
3854 }
3855 break;
3856 case MULT_EXPR:
3857 if (scale == 1 && tree_fits_shwi_p (op1))
3858 {
3859 int new_scale = tree_to_shwi (op1);
3860 /* Only treat this as a scaling operation if the target
3861 supports it. */
3862 if (use_ifn_p
3863 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3864 vectype, memory_type, 1,
3865 TYPE_SIGN (TREE_TYPE (op0)),
3866 new_scale, &ifn,
3867 &element_type))
3868 break;
3869 scale = new_scale;
3870 off = op0;
3871 continue;
3872 }
3873 break;
3874 case SSA_NAME:
3875 off = op0;
3876 continue;
3877 CASE_CONVERT:
3878 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3879 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3880 break;
3881 if (TYPE_PRECISION (TREE_TYPE (op0))
3882 == TYPE_PRECISION (TREE_TYPE (off)))
3883 {
3884 off = op0;
3885 continue;
3886 }
3887
3888 /* The internal functions need the offset to be the same width
3889 as the elements of VECTYPE. Don't include operations that
3890 cast the offset from that width to a different width. */
3891 if (use_ifn_p
3892 && (int_size_in_bytes (TREE_TYPE (vectype))
3893 == int_size_in_bytes (TREE_TYPE (off))))
3894 break;
3895
3896 if (TYPE_PRECISION (TREE_TYPE (op0))
3897 < TYPE_PRECISION (TREE_TYPE (off)))
3898 {
3899 off = op0;
3900 offtype = TREE_TYPE (off);
3901 STRIP_NOPS (off);
3902 continue;
3903 }
3904 break;
3905 default:
3906 break;
3907 }
3908 break;
3909 }
3910
3911 /* If at the end OFF still isn't a SSA_NAME or isn't
3912 defined in the loop, punt. */
3913 if (TREE_CODE (off) != SSA_NAME
3914 || expr_invariant_in_loop_p (loop, off))
3915 return false;
3916
3917 if (offtype == NULL_TREE)
3918 offtype = TREE_TYPE (off);
3919
3920 if (use_ifn_p)
3921 {
3922 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3923 memory_type, TYPE_PRECISION (offtype),
3924 TYPE_SIGN (offtype), scale, &ifn,
3925 &element_type))
3926 return false;
3927 }
3928 else
3929 {
3930 if (DR_IS_READ (dr))
3931 {
3932 if (targetm.vectorize.builtin_gather)
3933 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3934 }
3935 else
3936 {
3937 if (targetm.vectorize.builtin_scatter)
3938 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3939 }
3940
3941 if (!decl)
3942 return false;
3943
3944 ifn = IFN_LAST;
3945 element_type = TREE_TYPE (vectype);
3946 }
3947
3948 info->ifn = ifn;
3949 info->decl = decl;
3950 info->base = base;
3951 info->offset = off;
3952 info->offset_dt = vect_unknown_def_type;
3953 info->offset_vectype = NULL_TREE;
3954 info->scale = scale;
3955 info->element_type = element_type;
3956 info->memory_type = memory_type;
3957 return true;
3958 }
3959
3960 /* Find the data references in STMT, analyze them with respect to LOOP and
3961 append them to DATAREFS. Return false if datarefs in this stmt cannot
3962 be handled. */
3963
3964 bool
3965 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3966 vec<data_reference_p> *datarefs)
3967 {
3968 /* We can ignore clobbers for dataref analysis - they are removed during
3969 loop vectorization and BB vectorization checks dependences with a
3970 stmt walk. */
3971 if (gimple_clobber_p (stmt))
3972 return true;
3973
3974 if (gimple_has_volatile_ops (stmt))
3975 {
3976 if (dump_enabled_p ())
3977 {
3978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3979 "not vectorized: volatile type ");
3980 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3981 }
3982 return false;
3983 }
3984
3985 if (stmt_can_throw_internal (stmt))
3986 {
3987 if (dump_enabled_p ())
3988 {
3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3990 "not vectorized: statement can throw an "
3991 "exception ");
3992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3993 }
3994 return false;
3995 }
3996
3997 auto_vec<data_reference_p, 2> refs;
3998 if (!find_data_references_in_stmt (loop, stmt, &refs))
3999 return false;
4000
4001 if (refs.is_empty ())
4002 return true;
4003
4004 if (refs.length () > 1)
4005 {
4006 if (dump_enabled_p ())
4007 {
4008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4009 "not vectorized: more than one data ref "
4010 "in stmt: ");
4011 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4012 }
4013 return false;
4014 }
4015
4016 if (gcall *call = dyn_cast <gcall *> (stmt))
4017 if (!gimple_call_internal_p (call)
4018 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4019 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4020 {
4021 if (dump_enabled_p ())
4022 {
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4024 "not vectorized: dr in a call ");
4025 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4026 }
4027 return false;
4028 }
4029
4030 data_reference_p dr = refs.pop ();
4031 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4032 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4033 {
4034 if (dump_enabled_p ())
4035 {
4036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4037 "not vectorized: statement is bitfield "
4038 "access ");
4039 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4040 }
4041 return false;
4042 }
4043
4044 if (DR_BASE_ADDRESS (dr)
4045 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4046 {
4047 if (dump_enabled_p ())
4048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4049 "not vectorized: base addr of dr is a "
4050 "constant\n");
4051 return false;
4052 }
4053
4054 /* Check whether this may be a SIMD lane access and adjust the
4055 DR to make it easier for us to handle it. */
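  /* This is intended to catch the per-lane arrays that OMP SIMD lowering
     creates for privatized variables, where the access is roughly
     array[GOMP_SIMD_LANE (simduid)]: the offset is lane * element-size and
     the step within the innermost containing loop is zero.  */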
4056 if (loop
4057 && loop->simduid
4058 && (!DR_BASE_ADDRESS (dr)
4059 || !DR_OFFSET (dr)
4060 || !DR_INIT (dr)
4061 || !DR_STEP (dr)))
4062 {
4063 struct data_reference *newdr
4064 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4065 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4066 if (DR_BASE_ADDRESS (newdr)
4067 && DR_OFFSET (newdr)
4068 && DR_INIT (newdr)
4069 && DR_STEP (newdr)
4070 && integer_zerop (DR_STEP (newdr)))
4071 {
4072 tree off = DR_OFFSET (newdr);
4073 STRIP_NOPS (off);
4074 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4075 && TREE_CODE (off) == MULT_EXPR
4076 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4077 {
4078 tree step = TREE_OPERAND (off, 1);
4079 off = TREE_OPERAND (off, 0);
4080 STRIP_NOPS (off);
4081 if (CONVERT_EXPR_P (off)
4082 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4083 < TYPE_PRECISION (TREE_TYPE (off))))
4084 off = TREE_OPERAND (off, 0);
4085 if (TREE_CODE (off) == SSA_NAME)
4086 {
4087 gimple *def = SSA_NAME_DEF_STMT (off);
4088 tree reft = TREE_TYPE (DR_REF (newdr));
4089 if (is_gimple_call (def)
4090 && gimple_call_internal_p (def)
4091 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4092 {
4093 tree arg = gimple_call_arg (def, 0);
4094 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4095 arg = SSA_NAME_VAR (arg);
4096 if (arg == loop->simduid
4097 /* For now. */
4098 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4099 {
4100 DR_OFFSET (newdr) = ssize_int (0);
4101 DR_STEP (newdr) = step;
4102 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4103 DR_STEP_ALIGNMENT (newdr)
4104 = highest_pow2_factor (step);
4105 /* Mark as simd-lane access. */
4106 newdr->aux = (void *)-1;
4107 free_data_ref (dr);
4108 datarefs->safe_push (newdr);
4109 return true;
4110 }
4111 }
4112 }
4113 }
4114 }
4115 free_data_ref (newdr);
4116 }
4117
4118 datarefs->safe_push (dr);
4119 return true;
4120 }
4121
4122 /* Function vect_analyze_data_refs.
4123
4124 Find all the data references in the loop or basic block.
4125
4126 The general structure of the analysis of data refs in the vectorizer is as
4127 follows:
4128 1- vect_analyze_data_refs(loop/bb): call
4129 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4130 in the loop/bb and their dependences.
4131 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4132 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4133 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4134
4135 */
4136
4137 bool
4138 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4139 {
4140 struct loop *loop = NULL;
4141 unsigned int i;
4142 struct data_reference *dr;
4143 tree scalar_type;
4144
4145 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4146
4147 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4148 loop = LOOP_VINFO_LOOP (loop_vinfo);
4149
4150 /* Go through the data-refs, check that the analysis succeeded. Update
4151 pointer from stmt_vec_info struct to DR and vectype. */
4152
4153 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4154 FOR_EACH_VEC_ELT (datarefs, i, dr)
4155 {
4156 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4157 poly_uint64 vf;
4158
4159 gcc_assert (DR_REF (dr));
4160 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4161 gcc_assert (!stmt_info->dr_aux.dr);
4162 stmt_info->dr_aux.dr = dr;
4163 stmt_info->dr_aux.stmt = stmt_info;
4164
4165 /* Check that analysis of the data-ref succeeded. */
4166 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4167 || !DR_STEP (dr))
4168 {
4169 bool maybe_gather
4170 = DR_IS_READ (dr)
4171 && !TREE_THIS_VOLATILE (DR_REF (dr))
4172 && (targetm.vectorize.builtin_gather != NULL
4173 || supports_vec_gather_load_p ());
4174 bool maybe_scatter
4175 = DR_IS_WRITE (dr)
4176 && !TREE_THIS_VOLATILE (DR_REF (dr))
4177 && (targetm.vectorize.builtin_scatter != NULL
4178 || supports_vec_scatter_store_p ());
4179
4180 /* If target supports vector gather loads or scatter stores,
4181 see whether they can be used. */
4182 if (is_a <loop_vec_info> (vinfo)
4183 && !nested_in_vect_loop_p (loop, stmt_info))
4184 {
4185 if (maybe_gather || maybe_scatter)
4186 {
4187 if (maybe_gather)
4188 gatherscatter = GATHER;
4189 else
4190 gatherscatter = SCATTER;
4191 }
4192 }
4193
4194 if (gatherscatter == SG_NONE)
4195 {
4196 if (dump_enabled_p ())
4197 {
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "not vectorized: data ref analysis "
4200 "failed ");
4201 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4202 stmt_info->stmt, 0);
4203 }
4204 if (is_a <bb_vec_info> (vinfo))
4205 {
4206 /* In BB vectorization the ref can still participate
4207 in dependence analysis, we just can't vectorize it. */
4208 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4209 continue;
4210 }
4211 return false;
4212 }
4213 }
4214
4215 /* See if this was detected as SIMD lane access. */
4216 if (dr->aux == (void *)-1)
4217 {
4218 if (nested_in_vect_loop_p (loop, stmt_info))
4219 {
4220 if (dump_enabled_p ())
4221 {
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: data ref analysis "
4224 "failed ");
4225 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4226 stmt_info->stmt, 0);
4227 }
4228 return false;
4229 }
4230 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4231 }
4232
4233 tree base = get_base_address (DR_REF (dr));
4234 if (base && VAR_P (base) && DECL_NONALIASED (base))
4235 {
4236 if (dump_enabled_p ())
4237 {
4238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4239 "not vectorized: base object not addressable "
4240 "for stmt: ");
4241 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4242 stmt_info->stmt, 0);
4243 }
4244 if (is_a <bb_vec_info> (vinfo))
4245 {
4246 /* In BB vectorization the ref can still participate
4247 in dependence analysis, we just can't vectorize it. */
4248 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4249 continue;
4250 }
4251 return false;
4252 }
4253
4254 if (is_a <loop_vec_info> (vinfo)
4255 && DR_STEP (dr)
4256 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4257 {
4258 if (nested_in_vect_loop_p (loop, stmt_info))
4259 {
4260 if (dump_enabled_p ())
4261 {
4262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4263 "not vectorized: not suitable for strided "
4264 "load ");
4265 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4266 stmt_info->stmt, 0);
4267 }
4268 return false;
4269 }
4270 STMT_VINFO_STRIDED_P (stmt_info) = true;
4271 }
4272
4273 /* Update DR field in stmt_vec_info struct. */
4274
4275 /* If the dataref is in an inner-loop of the loop that is considered for
4276 vectorization, we also want to analyze the access relative to
4277 the outer-loop (DR contains information only relative to the
4278 inner-most enclosing loop). We do that by building a reference to the
4279 first location accessed by the inner-loop, and analyze it relative to
4280 the outer-loop. */
4281 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4282 {
4283 /* Build a reference to the first location accessed by the
4284 inner loop: *(BASE + INIT + OFFSET). By construction,
4285 this address must be invariant in the inner loop, so we
4286 can consider it as being used in the outer loop. */
4287 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4288 tree offset = unshare_expr (DR_OFFSET (dr));
4289 tree init = unshare_expr (DR_INIT (dr));
4290 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4291 init, offset);
4292 tree init_addr = fold_build_pointer_plus (base, init_offset);
4293 tree init_ref = build_fold_indirect_ref (init_addr);
4294
4295 if (dump_enabled_p ())
4296 {
4297 dump_printf_loc (MSG_NOTE, vect_location,
4298 "analyze in outer loop: ");
4299 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4300 dump_printf (MSG_NOTE, "\n");
4301 }
4302
4303 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4304 init_ref, loop))
4305 /* dr_analyze_innermost already explained the failure. */
4306 return false;
4307
4308 if (dump_enabled_p ())
4309 {
4310 dump_printf_loc (MSG_NOTE, vect_location,
4311 "\touter base_address: ");
4312 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4313 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4314 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4315 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4316 STMT_VINFO_DR_OFFSET (stmt_info));
4317 dump_printf (MSG_NOTE,
4318 "\n\touter constant offset from base address: ");
4319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4320 STMT_VINFO_DR_INIT (stmt_info));
4321 dump_printf (MSG_NOTE, "\n\touter step: ");
4322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4323 STMT_VINFO_DR_STEP (stmt_info));
4324 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4325 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4326 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4327 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4328 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4329 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4330 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4331 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4332 }
4333 }
4334
4335 /* Set vectype for STMT. */
4336 scalar_type = TREE_TYPE (DR_REF (dr));
4337 STMT_VINFO_VECTYPE (stmt_info)
4338 = get_vectype_for_scalar_type (scalar_type);
4339 if (!STMT_VINFO_VECTYPE (stmt_info))
4340 {
4341 if (dump_enabled_p ())
4342 {
4343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4344 "not vectorized: no vectype for stmt: ");
4345 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4346 stmt_info->stmt, 0);
4347 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4348 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4349 scalar_type);
4350 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4351 }
4352
4353 if (is_a <bb_vec_info> (vinfo))
4354 {
4355 /* No vector type is fine, the ref can still participate
4356 in dependence analysis, we just can't vectorize it. */
4357 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4358 continue;
4359 }
4360 return false;
4361 }
4362 else
4363 {
4364 if (dump_enabled_p ())
4365 {
4366 dump_printf_loc (MSG_NOTE, vect_location,
4367 "got vectype for stmt: ");
4368 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4369 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4370 STMT_VINFO_VECTYPE (stmt_info));
4371 dump_printf (MSG_NOTE, "\n");
4372 }
4373 }
4374
4375 /* Adjust the minimal vectorization factor according to the
4376 vector type. */
4377 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4378 *min_vf = upper_bound (*min_vf, vf);
4379
4380 if (gatherscatter != SG_NONE)
4381 {
4382 gather_scatter_info gs_info;
4383 if (!vect_check_gather_scatter (stmt_info,
4384 as_a <loop_vec_info> (vinfo),
4385 &gs_info)
4386 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4387 {
4388 if (dump_enabled_p ())
4389 {
4390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4391 (gatherscatter == GATHER) ?
4392 "not vectorized: not suitable for gather "
4393 "load " :
4394 "not vectorized: not suitable for scatter "
4395 "store ");
4396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4397 stmt_info->stmt, 0);
4398 }
4399 return false;
4400 }
4401 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4402 }
4403 }
4404
4405 /* We used to stop processing and prune the list here. Verify we no
4406 longer need to. */
4407 gcc_assert (i == datarefs.length ());
4408
4409 return true;
4410 }
4411
4412
4413 /* Function vect_get_new_vect_var.
4414
4415 Returns a name for a new variable.  The current naming scheme prefixes
4416 vectorizer-generated variable names with "vect_", "stmp_", "mask_" or
4417 "vectp_" (depending on the value of VAR_KIND) and appends NAME if
4418 provided. */
4419
4420 tree
4421 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4422 {
4423 const char *prefix;
4424 tree new_vect_var;
4425
4426 switch (var_kind)
4427 {
4428 case vect_simple_var:
4429 prefix = "vect";
4430 break;
4431 case vect_scalar_var:
4432 prefix = "stmp";
4433 break;
4434 case vect_mask_var:
4435 prefix = "mask";
4436 break;
4437 case vect_pointer_var:
4438 prefix = "vectp";
4439 break;
4440 default:
4441 gcc_unreachable ();
4442 }
4443
4444 if (name)
4445 {
4446 char* tmp = concat (prefix, "_", name, NULL);
4447 new_vect_var = create_tmp_reg (type, tmp);
4448 free (tmp);
4449 }
4450 else
4451 new_vect_var = create_tmp_reg (type, prefix);
4452
4453 return new_vect_var;
4454 }
4455
4456 /* Like vect_get_new_vect_var but return an SSA name. */
4457
4458 tree
4459 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4460 {
4461 const char *prefix;
4462 tree new_vect_var;
4463
4464 switch (var_kind)
4465 {
4466 case vect_simple_var:
4467 prefix = "vect";
4468 break;
4469 case vect_scalar_var:
4470 prefix = "stmp";
4471 break;
4472 case vect_pointer_var:
4473 prefix = "vectp";
4474 break;
4475 default:
4476 gcc_unreachable ();
4477 }
4478
4479 if (name)
4480 {
4481 char* tmp = concat (prefix, "_", name, NULL);
4482 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4483 free (tmp);
4484 }
4485 else
4486 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4487
4488 return new_vect_var;
4489 }
4490
4491 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4492
4493 static void
4494 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4495 {
4496 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4497 int misalign = DR_MISALIGNMENT (dr_info);
4498 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4499 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4500 else
4501 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4502 DR_TARGET_ALIGNMENT (dr_info), misalign);
4503 }
4504
4505 /* Function vect_create_addr_base_for_vector_ref.
4506
4507 Create an expression that computes the address of the first memory location
4508 that will be accessed for a data reference.
4509
4510 Input:
4511 STMT_INFO: The statement containing the data reference.
4512 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4513 OFFSET: Optional. If supplied, it is added to the initial address.
4514 LOOP: Specify the loop-nest relative to which the address should be computed.
4515 For example, when the dataref is in an inner-loop nested in an
4516 outer-loop that is now being vectorized, LOOP can be either the
4517 outer-loop, or the inner-loop. The first memory location accessed
4518 by the following dataref ('in' points to short):
4519
4520 for (i=0; i<N; i++)
4521 for (j=0; j<M; j++)
4522 s += in[i+j]
4523
4524 is as follows:
4525 if LOOP=i_loop: &in (relative to i_loop)
4526 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4527 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4528 initial address. Unlike OFFSET, which is the number of elements to
4529 be added, BYTE_OFFSET is measured in bytes.
4530
4531 Output:
4532 1. Return an SSA_NAME whose value is the address of the memory location of
4533 the first vector of the data reference.
4534 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4535 these statement(s) which define the returned SSA_NAME.
4536
4537 FORNOW: We are only handling array accesses with step 1. */
4538
4539 tree
4540 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4541 gimple_seq *new_stmt_list,
4542 tree offset,
4543 tree byte_offset)
4544 {
4545 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4546 struct data_reference *dr = dr_info->dr;
4547 const char *base_name;
4548 tree addr_base;
4549 tree dest;
4550 gimple_seq seq = NULL;
4551 tree vect_ptr_type;
4552 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4553 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4554 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4555
4556 tree data_ref_base = unshare_expr (drb->base_address);
4557 tree base_offset = unshare_expr (drb->offset);
4558 tree init = unshare_expr (drb->init);
4559
4560 if (loop_vinfo)
4561 base_name = get_name (data_ref_base);
4562 else
4563 {
4564 base_offset = ssize_int (0);
4565 init = ssize_int (0);
4566 base_name = get_name (DR_REF (dr));
4567 }
4568
4569 /* Create base_offset */
4570 base_offset = size_binop (PLUS_EXPR,
4571 fold_convert (sizetype, base_offset),
4572 fold_convert (sizetype, init));
4573
4574 if (offset)
4575 {
4576 offset = fold_build2 (MULT_EXPR, sizetype,
4577 fold_convert (sizetype, offset), step);
4578 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4579 base_offset, offset);
4580 }
4581 if (byte_offset)
4582 {
4583 byte_offset = fold_convert (sizetype, byte_offset);
4584 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4585 base_offset, byte_offset);
4586 }
4587
4588 /* base + base_offset */
4589 if (loop_vinfo)
4590 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4591 else
4592 {
4593 addr_base = build1 (ADDR_EXPR,
4594 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4595 unshare_expr (DR_REF (dr)));
4596 }
4597
4598 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4599 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4600 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4601 gimple_seq_add_seq (new_stmt_list, seq);
4602
4603 if (DR_PTR_INFO (dr)
4604 && TREE_CODE (addr_base) == SSA_NAME
4605 && !SSA_NAME_PTR_INFO (addr_base))
4606 {
4607 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4608 if (offset || byte_offset)
4609 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4610 }
4611
4612 if (dump_enabled_p ())
4613 {
4614 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4615 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4616 dump_printf (MSG_NOTE, "\n");
4617 }
4618
4619 return addr_base;
4620 }
4621
4622
4623 /* Function vect_create_data_ref_ptr.
4624
4625 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4626 location accessed in the loop by STMT_INFO, along with the def-use update
4627 chain to appropriately advance the pointer through the loop iterations.
4628 Also set aliasing information for the pointer. This pointer is used by
4629 the callers to this function to create a memory reference expression for
4630 vector load/store access.
4631
4632 Input:
4633 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4634 GIMPLE_ASSIGN <name, data-ref> or
4635 GIMPLE_ASSIGN <data-ref, name>.
4636 2. AGGR_TYPE: the type of the reference, which should be either a vector
4637 or an array.
4638 3. AT_LOOP: the loop where the vector memref is to be created.
4639 4. OFFSET (optional): an offset to be added to the initial address accessed
4640 by the data-ref in STMT_INFO.
4641 5. BSI: location where the new stmts are to be placed if there is no loop
4642 6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
4643 pointing to the initial address.
4644 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4645 to the initial address accessed by the data-ref in STMT_INFO. This is
4646 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4647 in bytes.
4648 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4649 to the IV during each iteration of the loop. NULL says to move
4650 by one copy of AGGR_TYPE up or down, depending on the step of the
4651 data reference.
4652
4653 Output:
4654 1. Declare a new ptr to vector_type, and have it point to the base of the
4655 data reference (initial address accessed by the data reference).
4656 For example, for vector of type V8HI, the following code is generated:
4657
4658 v8hi *ap;
4659 ap = (v8hi *)initial_address;
4660
4661 if OFFSET is not supplied:
4662 initial_address = &a[init];
4663 if OFFSET is supplied:
4664 initial_address = &a[init + OFFSET];
4665 if BYTE_OFFSET is supplied:
4666 initial_address = &a[init] + BYTE_OFFSET;
4667
4668 Return the initial_address in INITIAL_ADDRESS.
4669
4670 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4671 update the pointer in each iteration of the loop.
4672
4673 Return the increment stmt that updates the pointer in PTR_INCR.
4674
4675 3. Return the pointer. */
4676
4677 tree
4678 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4679 struct loop *at_loop, tree offset,
4680 tree *initial_address, gimple_stmt_iterator *gsi,
4681 gimple **ptr_incr, bool only_init,
4682 tree byte_offset, tree iv_step)
4683 {
4684 const char *base_name;
4685 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4686 struct loop *loop = NULL;
4687 bool nested_in_vect_loop = false;
4688 struct loop *containing_loop = NULL;
4689 tree aggr_ptr_type;
4690 tree aggr_ptr;
4691 tree new_temp;
4692 gimple_seq new_stmt_list = NULL;
4693 edge pe = NULL;
4694 basic_block new_bb;
4695 tree aggr_ptr_init;
4696 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4697 struct data_reference *dr = dr_info->dr;
4698 tree aptr;
4699 gimple_stmt_iterator incr_gsi;
4700 bool insert_after;
4701 tree indx_before_incr, indx_after_incr;
4702 gimple *incr;
4703 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4704
4705 gcc_assert (iv_step != NULL_TREE
4706 || TREE_CODE (aggr_type) == ARRAY_TYPE
4707 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4708
4709 if (loop_vinfo)
4710 {
4711 loop = LOOP_VINFO_LOOP (loop_vinfo);
4712 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4713 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4714 pe = loop_preheader_edge (loop);
4715 }
4716 else
4717 {
4718 gcc_assert (bb_vinfo);
4719 only_init = true;
4720 *ptr_incr = NULL;
4721 }
4722
4723 /* Create an expression for the first address accessed by this load
4724 in LOOP. */
4725 base_name = get_name (DR_BASE_ADDRESS (dr));
4726
4727 if (dump_enabled_p ())
4728 {
4729 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4730 dump_printf_loc (MSG_NOTE, vect_location,
4731 "create %s-pointer variable to type: ",
4732 get_tree_code_name (TREE_CODE (aggr_type)));
4733 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4734 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4735 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4736 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4737 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4738 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4739 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4740 else
4741 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4742 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4743 dump_printf (MSG_NOTE, "\n");
4744 }
4745
4746 /* (1) Create the new aggregate-pointer variable.
4747 Vector and array types inherit the alias set of their component
4748 type by default so we need to use a ref-all pointer if the data
4749 reference does not conflict with the created aggregated data
4750 reference because it is not addressable. */
4751 bool need_ref_all = false;
4752 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4753 get_alias_set (DR_REF (dr))))
4754 need_ref_all = true;
4755 /* Likewise for any of the data references in the stmt group. */
4756 else if (DR_GROUP_SIZE (stmt_info) > 1)
4757 {
4758 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4759 do
4760 {
4761 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4762 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4763 get_alias_set (DR_REF (sdr))))
4764 {
4765 need_ref_all = true;
4766 break;
4767 }
4768 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4769 }
4770 while (sinfo);
4771 }
4772 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4773 need_ref_all);
4774 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4775
4776
4777 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4778 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4779 def-use update cycles for the pointer: one relative to the outer-loop
4780 (LOOP), which is what steps (3) and (4) below do. The other is relative
4781 to the inner-loop (which is the inner-most loop containing the dataref),
4782 and this is done be step (5) below.
4783
4784 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4785 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4786 redundant. Steps (3),(4) create the following:
4787
4788 vp0 = &base_addr;
4789 LOOP: vp1 = phi(vp0,vp2)
4790 ...
4791 ...
4792 vp2 = vp1 + step
4793 goto LOOP
4794
4795 If there is an inner-loop nested in loop, then step (5) will also be
4796 applied, and an additional update in the inner-loop will be created:
4797
4798 vp0 = &base_addr;
4799 LOOP: vp1 = phi(vp0,vp2)
4800 ...
4801 inner: vp3 = phi(vp1,vp4)
4802 vp4 = vp3 + inner_step
4803 if () goto inner
4804 ...
4805 vp2 = vp1 + step
4806 if () goto LOOP */
4807
4808 /* (2) Calculate the initial address of the aggregate-pointer, and set
4809 the aggregate-pointer to point to it before the loop. */
4810
4811 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4812
4813 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4814 offset, byte_offset);
4815 if (new_stmt_list)
4816 {
4817 if (pe)
4818 {
4819 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4820 gcc_assert (!new_bb);
4821 }
4822 else
4823 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4824 }
4825
4826 *initial_address = new_temp;
4827 aggr_ptr_init = new_temp;
4828
4829 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4830 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4831 inner-loop nested in LOOP (during outer-loop vectorization). */
4832
4833 /* No update in loop is required. */
4834 if (only_init && (!loop_vinfo || at_loop == loop))
4835 aptr = aggr_ptr_init;
4836 else
4837 {
4838 /* Accesses to invariant addresses should be handled specially
4839 by the caller. */
4840 tree step = vect_dr_behavior (dr_info)->step;
4841 gcc_assert (!integer_zerop (step));
4842
4843 if (iv_step == NULL_TREE)
4844 {
4845 /* The step of the aggregate pointer is the type size,
4846 negated for downward accesses. */
4847 iv_step = TYPE_SIZE_UNIT (aggr_type);
4848 if (tree_int_cst_sgn (step) == -1)
4849 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4850 }
4851
4852 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4853
4854 create_iv (aggr_ptr_init,
4855 fold_convert (aggr_ptr_type, iv_step),
4856 aggr_ptr, loop, &incr_gsi, insert_after,
4857 &indx_before_incr, &indx_after_incr);
4858 incr = gsi_stmt (incr_gsi);
4859 loop_vinfo->add_stmt (incr);
4860
4861 /* Copy the points-to information if it exists. */
4862 if (DR_PTR_INFO (dr))
4863 {
4864 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4865 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4866 }
4867 if (ptr_incr)
4868 *ptr_incr = incr;
4869
4870 aptr = indx_before_incr;
4871 }
4872
4873 if (!nested_in_vect_loop || only_init)
4874 return aptr;
4875
4876
4877 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4878 nested in LOOP, if exists. */
4879
4880 gcc_assert (nested_in_vect_loop);
4881 if (!only_init)
4882 {
4883 standard_iv_increment_position (containing_loop, &incr_gsi,
4884 &insert_after);
4885 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4886 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4887 &indx_after_incr);
4888 incr = gsi_stmt (incr_gsi);
4889 loop_vinfo->add_stmt (incr);
4890
4891 /* Copy the points-to information if it exists. */
4892 if (DR_PTR_INFO (dr))
4893 {
4894 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4895 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4896 }
4897 if (ptr_incr)
4898 *ptr_incr = incr;
4899
4900 return indx_before_incr;
4901 }
4902 else
4903 gcc_unreachable ();
4904 }
4905
4906
4907 /* Function bump_vector_ptr
4908
4909 Increment a pointer (to a vector type) by vector-size. If requested,
4910 i.e. if PTR-INCR is given, then also connect the new increment stmt
4911 to the existing def-use update-chain of the pointer, by modifying
4912 the PTR_INCR as illustrated below:
4913
4914 The pointer def-use update-chain before this function:
4915 DATAREF_PTR = phi (p_0, p_2)
4916 ....
4917 PTR_INCR: p_2 = DATAREF_PTR + step
4918
4919 The pointer def-use update-chain after this function:
4920 DATAREF_PTR = phi (p_0, p_2)
4921 ....
4922 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4923 ....
4924 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4925
4926 Input:
4927 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4928 in the loop.
4929 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4930 the loop. The increment amount across iterations is expected
4931 to be vector_size.
4932 BSI - location where the new update stmt is to be placed.
4933 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4934 BUMP - optional. The offset by which to bump the pointer. If not given,
4935 the offset is assumed to be vector_size.
4936
4937 Output: Return NEW_DATAREF_PTR as illustrated above.
4938
4939 */
4940
4941 tree
4942 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4943 stmt_vec_info stmt_info, tree bump)
4944 {
4945 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4946 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4947 tree update = TYPE_SIZE_UNIT (vectype);
4948 gassign *incr_stmt;
4949 ssa_op_iter iter;
4950 use_operand_p use_p;
4951 tree new_dataref_ptr;
4952
4953 if (bump)
4954 update = bump;
4955
4956 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4957 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4958 else
4959 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4960 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4961 dataref_ptr, update);
4962 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4963
4964 /* Copy the points-to information if it exists. */
4965 if (DR_PTR_INFO (dr))
4966 {
4967 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4968 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4969 }
4970
4971 if (!ptr_incr)
4972 return new_dataref_ptr;
4973
4974 /* Update the vector-pointer's cross-iteration increment. */
4975 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4976 {
4977 tree use = USE_FROM_PTR (use_p);
4978
4979 if (use == dataref_ptr)
4980 SET_USE (use_p, new_dataref_ptr);
4981 else
4982 gcc_assert (operand_equal_p (use, update, 0));
4983 }
4984
4985 return new_dataref_ptr;
4986 }
4987
4988
4989 /* Copy memory reference info such as base/clique from the SRC reference
4990 to the DEST MEM_REF. */
4991
4992 void
4993 vect_copy_ref_info (tree dest, tree src)
4994 {
4995 if (TREE_CODE (dest) != MEM_REF)
4996 return;
4997
4998 tree src_base = src;
4999 while (handled_component_p (src_base))
5000 src_base = TREE_OPERAND (src_base, 0);
5001 if (TREE_CODE (src_base) != MEM_REF
5002 && TREE_CODE (src_base) != TARGET_MEM_REF)
5003 return;
5004
5005 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5006 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5007 }
5008
5009
5010 /* Function vect_create_destination_var.
5011
5012 Create a new temporary of type VECTYPE. */
5013
5014 tree
5015 vect_create_destination_var (tree scalar_dest, tree vectype)
5016 {
5017 tree vec_dest;
5018 const char *name;
5019 char *new_name;
5020 tree type;
5021 enum vect_var_kind kind;
5022
5023 kind = vectype
5024 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5025 ? vect_mask_var
5026 : vect_simple_var
5027 : vect_scalar_var;
5028 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5029
5030 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5031
5032 name = get_name (scalar_dest);
5033 if (name)
5034 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5035 else
5036 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5037 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5038 free (new_name);
5039
5040 return vec_dest;
5041 }
5042
5043 /* Function vect_grouped_store_supported.
5044
5045 Returns TRUE if interleave high and interleave low permutations
5046 are supported, and FALSE otherwise. */
5047
5048 bool
5049 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5050 {
5051 machine_mode mode = TYPE_MODE (vectype);
5052
5053 /* vect_permute_store_chain requires the group size to be equal to 3 or
5054 be a power of two. */
5055 if (count != 3 && exact_log2 (count) == -1)
5056 {
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059 "the size of the group of accesses"
5060 					 " is not a power of 2 or not equal to 3\n");
5061 return false;
5062 }
5063
5064 /* Check that the permutation is supported. */
5065 if (VECTOR_MODE_P (mode))
5066 {
5067 unsigned int i;
5068 if (count == 3)
5069 {
5070 unsigned int j0 = 0, j1 = 0, j2 = 0;
5071 unsigned int i, j;
5072
5073 unsigned int nelt;
5074 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5075 {
5076 if (dump_enabled_p ())
5077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5078 "cannot handle groups of 3 stores for"
5079 " variable-length vectors\n");
5080 return false;
5081 }
5082
5083 vec_perm_builder sel (nelt, nelt, 1);
5084 sel.quick_grow (nelt);
5085 vec_perm_indices indices;
5086 for (j = 0; j < 3; j++)
5087 {
5088 int nelt0 = ((3 - j) * nelt) % 3;
5089 int nelt1 = ((3 - j) * nelt + 1) % 3;
5090 int nelt2 = ((3 - j) * nelt + 2) % 3;
5091 for (i = 0; i < nelt; i++)
5092 {
5093 if (3 * i + nelt0 < nelt)
5094 sel[3 * i + nelt0] = j0++;
5095 if (3 * i + nelt1 < nelt)
5096 sel[3 * i + nelt1] = nelt + j1++;
5097 if (3 * i + nelt2 < nelt)
5098 sel[3 * i + nelt2] = 0;
5099 }
5100 indices.new_vector (sel, 2, nelt);
5101 if (!can_vec_perm_const_p (mode, indices))
5102 {
5103 if (dump_enabled_p ())
5104 dump_printf (MSG_MISSED_OPTIMIZATION,
5105 "permutation op not supported by target.\n");
5106 return false;
5107 }
5108
5109 for (i = 0; i < nelt; i++)
5110 {
5111 if (3 * i + nelt0 < nelt)
5112 sel[3 * i + nelt0] = 3 * i + nelt0;
5113 if (3 * i + nelt1 < nelt)
5114 sel[3 * i + nelt1] = 3 * i + nelt1;
5115 if (3 * i + nelt2 < nelt)
5116 sel[3 * i + nelt2] = nelt + j2++;
5117 }
5118 indices.new_vector (sel, 2, nelt);
5119 if (!can_vec_perm_const_p (mode, indices))
5120 {
5121 if (dump_enabled_p ())
5122 dump_printf (MSG_MISSED_OPTIMIZATION,
5123 "permutation op not supported by target.\n");
5124 return false;
5125 }
5126 }
5127 return true;
5128 }
5129 else
5130 {
5131 /* If length is not equal to 3 then only power of 2 is supported. */
5132 gcc_assert (pow2p_hwi (count));
5133 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5134
5135 /* The encoding has 2 interleaved stepped patterns. */
5136 vec_perm_builder sel (nelt, 2, 3);
5137 sel.quick_grow (6);
5138 for (i = 0; i < 3; i++)
5139 {
5140 sel[i * 2] = i;
5141 sel[i * 2 + 1] = i + nelt;
5142 }
5143 vec_perm_indices indices (sel, 2, nelt);
5144 if (can_vec_perm_const_p (mode, indices))
5145 {
5146 for (i = 0; i < 6; i++)
5147 sel[i] += exact_div (nelt, 2);
5148 indices.new_vector (sel, 2, nelt);
5149 if (can_vec_perm_const_p (mode, indices))
5150 return true;
5151 }
5152 }
5153 }
5154
5155 if (dump_enabled_p ())
5156 dump_printf (MSG_MISSED_OPTIMIZATION,
5157 "permutation op not supported by target.\n");
5158 return false;
5159 }
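
/* For illustration only: with a hypothetical 8-element vector mode the two
   encoded selectors tested above for the power-of-two case expand to

     { 0, 8, 1, 9, 2, 10, 3, 11 }    interleave high
     { 4, 12, 5, 13, 6, 14, 7, 15 }  interleave low

   i.e. the permutations that vect_permute_store_chain emits below for a
   power-of-two group.  */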
5160
5161
5162 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5163 type VECTYPE. MASKED_P says whether the masked form is needed. */
5164
5165 bool
5166 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5167 bool masked_p)
5168 {
5169 if (masked_p)
5170 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5171 vec_mask_store_lanes_optab,
5172 vectype, count);
5173 else
5174 return vect_lanes_optab_supported_p ("vec_store_lanes",
5175 vec_store_lanes_optab,
5176 vectype, count);
5177 }
5178
5179
5180 /* Function vect_permute_store_chain.
5181
5182 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5183 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5184 the data correctly for the stores. Return the final references for stores
5185 in RESULT_CHAIN.
5186
5187 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5188 The input is 4 vectors each containing 8 elements. We assign a number to
5189 each element, the input sequence is:
5190
5191 1st vec: 0 1 2 3 4 5 6 7
5192 2nd vec: 8 9 10 11 12 13 14 15
5193 3rd vec: 16 17 18 19 20 21 22 23
5194 4th vec: 24 25 26 27 28 29 30 31
5195
5196 The output sequence should be:
5197
5198 1st vec: 0 8 16 24 1 9 17 25
5199 2nd vec: 2 10 18 26 3 11 19 27
5200    3rd vec:   4 12 20 28  5 13 21 29
5201 4th vec: 6 14 22 30 7 15 23 31
5202
5203 i.e., we interleave the contents of the four vectors in their order.
5204
5205 We use interleave_high/low instructions to create such output. The input of
5206 each interleave_high/low operation is two vectors:
5207 1st vec 2nd vec
5208 0 1 2 3 4 5 6 7
5209 the even elements of the result vector are obtained left-to-right from the
5210 high/low elements of the first vector. The odd elements of the result are
5211 obtained left-to-right from the high/low elements of the second vector.
5212 The output of interleave_high will be: 0 4 1 5
5213 and of interleave_low: 2 6 3 7
5214
5215
5216 The permutation is done in log LENGTH stages. In each stage interleave_high
5217 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5218 where the first argument is taken from the first half of DR_CHAIN and the
5219    second argument from its second half.
5220 In our example,
5221
5222 I1: interleave_high (1st vec, 3rd vec)
5223 I2: interleave_low (1st vec, 3rd vec)
5224 I3: interleave_high (2nd vec, 4th vec)
5225 I4: interleave_low (2nd vec, 4th vec)
5226
5227 The output for the first stage is:
5228
5229 I1: 0 16 1 17 2 18 3 19
5230 I2: 4 20 5 21 6 22 7 23
5231 I3: 8 24 9 25 10 26 11 27
5232 I4: 12 28 13 29 14 30 15 31
5233
5234 The output of the second stage, i.e. the final result is:
5235
5236 I1: 0 8 16 24 1 9 17 25
5237 I2: 2 10 18 26 3 11 19 27
5238    I3:  4 12 20 28  5 13 21 29
5239 I4: 6 14 22 30 7 15 23 31. */
5240
5241 void
5242 vect_permute_store_chain (vec<tree> dr_chain,
5243 unsigned int length,
5244 stmt_vec_info stmt_info,
5245 gimple_stmt_iterator *gsi,
5246 vec<tree> *result_chain)
5247 {
5248 tree vect1, vect2, high, low;
5249 gimple *perm_stmt;
5250 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5251 tree perm_mask_low, perm_mask_high;
5252 tree data_ref;
5253 tree perm3_mask_low, perm3_mask_high;
5254 unsigned int i, j, n, log_length = exact_log2 (length);
5255
5256 result_chain->quick_grow (length);
5257 memcpy (result_chain->address (), dr_chain.address (),
5258 length * sizeof (tree));
5259
5260 if (length == 3)
5261 {
5262 /* vect_grouped_store_supported ensures that this is constant. */
5263 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5264 unsigned int j0 = 0, j1 = 0, j2 = 0;
5265
5266 vec_perm_builder sel (nelt, nelt, 1);
5267 sel.quick_grow (nelt);
5268 vec_perm_indices indices;
5269 for (j = 0; j < 3; j++)
5270 {
5271 int nelt0 = ((3 - j) * nelt) % 3;
5272 int nelt1 = ((3 - j) * nelt + 1) % 3;
5273 int nelt2 = ((3 - j) * nelt + 2) % 3;
5274
5275 for (i = 0; i < nelt; i++)
5276 {
5277 if (3 * i + nelt0 < nelt)
5278 sel[3 * i + nelt0] = j0++;
5279 if (3 * i + nelt1 < nelt)
5280 sel[3 * i + nelt1] = nelt + j1++;
5281 if (3 * i + nelt2 < nelt)
5282 sel[3 * i + nelt2] = 0;
5283 }
5284 indices.new_vector (sel, 2, nelt);
5285 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5286
5287 for (i = 0; i < nelt; i++)
5288 {
5289 if (3 * i + nelt0 < nelt)
5290 sel[3 * i + nelt0] = 3 * i + nelt0;
5291 if (3 * i + nelt1 < nelt)
5292 sel[3 * i + nelt1] = 3 * i + nelt1;
5293 if (3 * i + nelt2 < nelt)
5294 sel[3 * i + nelt2] = nelt + j2++;
5295 }
5296 indices.new_vector (sel, 2, nelt);
5297 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5298
5299 vect1 = dr_chain[0];
5300 vect2 = dr_chain[1];
5301
5302 /* Create interleaving stmt:
5303 low = VEC_PERM_EXPR <vect1, vect2,
5304 {j, nelt, *, j + 1, nelt + j + 1, *,
5305 j + 2, nelt + j + 2, *, ...}> */
5306 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5307 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5308 vect2, perm3_mask_low);
5309 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5310
5311 vect1 = data_ref;
5312 vect2 = dr_chain[2];
5313 /* Create interleaving stmt:
5314 low = VEC_PERM_EXPR <vect1, vect2,
5315 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5316 6, 7, nelt + j + 2, ...}> */
5317 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5318 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5319 vect2, perm3_mask_high);
5320 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5321 (*result_chain)[j] = data_ref;
5322 }
5323 }
5324 else
5325 {
5326 /* If length is not equal to 3 then only power of 2 is supported. */
5327 gcc_assert (pow2p_hwi (length));
5328
5329 /* The encoding has 2 interleaved stepped patterns. */
5330 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5331 vec_perm_builder sel (nelt, 2, 3);
5332 sel.quick_grow (6);
5333 for (i = 0; i < 3; i++)
5334 {
5335 sel[i * 2] = i;
5336 sel[i * 2 + 1] = i + nelt;
5337 }
5338 vec_perm_indices indices (sel, 2, nelt);
5339 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5340
5341 for (i = 0; i < 6; i++)
5342 sel[i] += exact_div (nelt, 2);
5343 indices.new_vector (sel, 2, nelt);
5344 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5345
5346 for (i = 0, n = log_length; i < n; i++)
5347 {
5348 for (j = 0; j < length/2; j++)
5349 {
5350 vect1 = dr_chain[j];
5351 vect2 = dr_chain[j+length/2];
5352
5353 /* Create interleaving stmt:
5354 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5355 ...}> */
5356 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5357 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5358 vect2, perm_mask_high);
5359 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5360 (*result_chain)[2*j] = high;
5361
5362 /* Create interleaving stmt:
5363 low = VEC_PERM_EXPR <vect1, vect2,
5364 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5365 ...}> */
5366 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5367 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5368 vect2, perm_mask_low);
5369 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5370 (*result_chain)[2*j+1] = low;
5371 }
5372 memcpy (dr_chain.address (), result_chain->address (),
5373 length * sizeof (tree));
5374 }
5375 }
5376 }
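
/* As a rough sketch (SSA version numbers invented for illustration), for
   LENGTH == 2 and a hypothetical 8-element vector chain {v0, v1} the function
   above generates

     vect_inter_high_1 = VEC_PERM_EXPR <v0, v1, { 0, 8, 1, 9, 2, 10, 3, 11 }>;
     vect_inter_low_2 = VEC_PERM_EXPR <v0, v1, { 4, 12, 5, 13, 6, 14, 7, 15 }>;

   and RESULT_CHAIN becomes { vect_inter_high_1, vect_inter_low_2 }.  */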
5377
5378 /* Function vect_setup_realignment
5379
5380 This function is called when vectorizing an unaligned load using
5381 the dr_explicit_realign[_optimized] scheme.
5382 This function generates the following code at the loop prolog:
5383
5384 p = initial_addr;
5385 x msq_init = *(floor(p)); # prolog load
5386 realignment_token = call target_builtin;
5387 loop:
5388 x msq = phi (msq_init, ---)
5389
5390 The stmts marked with x are generated only for the case of
5391 dr_explicit_realign_optimized.
5392
5393 The code above sets up a new (vector) pointer, pointing to the first
5394 location accessed by STMT_INFO, and a "floor-aligned" load using that
5395 pointer. It also generates code to compute the "realignment-token"
5396 (if the relevant target hook was defined), and creates a phi-node at the
5397 loop-header bb whose arguments are the result of the prolog-load (created
5398 by this function) and the result of a load that takes place in the loop
5399 (to be created by the caller to this function).
5400
5401 For the case of dr_explicit_realign_optimized:
5402 The caller to this function uses the phi-result (msq) to create the
5403 realignment code inside the loop, and sets up the missing phi argument,
5404 as follows:
5405 loop:
5406 msq = phi (msq_init, lsq)
5407 lsq = *(floor(p')); # load in loop
5408 result = realign_load (msq, lsq, realignment_token);
5409
5410 For the case of dr_explicit_realign:
5411 loop:
5412 msq = *(floor(p)); # load in loop
5413 p' = p + (VS-1);
5414 lsq = *(floor(p')); # load in loop
5415 result = realign_load (msq, lsq, realignment_token);
5416
5417 Input:
5418 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5419 a memory location that may be unaligned.
5420 BSI - place where new code is to be inserted.
5421 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5422 is used.
5423
5424 Output:
5425 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5426 target hook, if defined.
5427 Return value - the result of the loop-header phi node. */
5428
5429 tree
5430 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5431 tree *realignment_token,
5432 enum dr_alignment_support alignment_support_scheme,
5433 tree init_addr,
5434 struct loop **at_loop)
5435 {
5436 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5439 struct data_reference *dr = dr_info->dr;
5440 struct loop *loop = NULL;
5441 edge pe = NULL;
5442 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5443 tree vec_dest;
5444 gimple *inc;
5445 tree ptr;
5446 tree data_ref;
5447 basic_block new_bb;
5448 tree msq_init = NULL_TREE;
5449 tree new_temp;
5450 gphi *phi_stmt;
5451 tree msq = NULL_TREE;
5452 gimple_seq stmts = NULL;
5453 bool compute_in_loop = false;
5454 bool nested_in_vect_loop = false;
5455 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5456 struct loop *loop_for_initial_load = NULL;
5457
5458 if (loop_vinfo)
5459 {
5460 loop = LOOP_VINFO_LOOP (loop_vinfo);
5461 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5462 }
5463
5464 gcc_assert (alignment_support_scheme == dr_explicit_realign
5465 || alignment_support_scheme == dr_explicit_realign_optimized);
5466
5467 /* We need to generate three things:
5468 1. the misalignment computation
5469 2. the extra vector load (for the optimized realignment scheme).
5470 3. the phi node for the two vectors from which the realignment is
5471 done (for the optimized realignment scheme). */
5472
5473 /* 1. Determine where to generate the misalignment computation.
5474
5475 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5476 calculation will be generated by this function, outside the loop (in the
5477 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5478 caller, inside the loop.
5479
5480 Background: If the misalignment remains fixed throughout the iterations of
5481 the loop, then both realignment schemes are applicable, and also the
5482 misalignment computation can be done outside LOOP. This is because we are
5483 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5484 are a multiple of VS (the Vector Size), and therefore the misalignment in
5485 different vectorized LOOP iterations is always the same.
5486 The problem arises only if the memory access is in an inner-loop nested
5487 inside LOOP, which is now being vectorized using outer-loop vectorization.
5488 This is the only case when the misalignment of the memory access may not
5489 remain fixed throughout the iterations of the inner-loop (as explained in
5490 detail in vect_supportable_dr_alignment). In this case, not only is the
5491 optimized realignment scheme not applicable, but also the misalignment
5492 computation (and generation of the realignment token that is passed to
5493 REALIGN_LOAD) have to be done inside the loop.
5494
5495 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5496 or not, which in turn determines if the misalignment is computed inside
5497 the inner-loop, or outside LOOP. */
5498
5499 if (init_addr != NULL_TREE || !loop_vinfo)
5500 {
5501 compute_in_loop = true;
5502 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5503 }
5504
5505
5506 /* 2. Determine where to generate the extra vector load.
5507
5508 For the optimized realignment scheme, instead of generating two vector
5509 loads in each iteration, we generate a single extra vector load in the
5510 preheader of the loop, and in each iteration reuse the result of the
5511 vector load from the previous iteration. In case the memory access is in
5512 an inner-loop nested inside LOOP, which is now being vectorized using
5513 outer-loop vectorization, we need to determine whether this initial vector
5514 load should be generated at the preheader of the inner-loop, or can be
5515 generated at the preheader of LOOP. If the memory access has no evolution
5516 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5517 to be generated inside LOOP (in the preheader of the inner-loop). */
5518
5519 if (nested_in_vect_loop)
5520 {
5521 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5522 bool invariant_in_outerloop =
5523 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5524 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5525 }
5526 else
5527 loop_for_initial_load = loop;
5528 if (at_loop)
5529 *at_loop = loop_for_initial_load;
5530
5531 if (loop_for_initial_load)
5532 pe = loop_preheader_edge (loop_for_initial_load);
5533
5534 /* 3. For the case of the optimized realignment, create the first vector
5535 load at the loop preheader. */
5536
5537 if (alignment_support_scheme == dr_explicit_realign_optimized)
5538 {
5539 /* Create msq_init = *(floor(p1)) in the loop preheader */
5540 gassign *new_stmt;
5541
5542 gcc_assert (!compute_in_loop);
5543 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5544 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5545 loop_for_initial_load, NULL_TREE,
5546 &init_addr, NULL, &inc, true);
5547 if (TREE_CODE (ptr) == SSA_NAME)
5548 new_temp = copy_ssa_name (ptr);
5549 else
5550 new_temp = make_ssa_name (TREE_TYPE (ptr));
5551 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5552 new_stmt = gimple_build_assign
5553 (new_temp, BIT_AND_EXPR, ptr,
5554 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5555 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5556 gcc_assert (!new_bb);
5557 data_ref
5558 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5559 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5560 vect_copy_ref_info (data_ref, DR_REF (dr));
5561 new_stmt = gimple_build_assign (vec_dest, data_ref);
5562 new_temp = make_ssa_name (vec_dest, new_stmt);
5563 gimple_assign_set_lhs (new_stmt, new_temp);
5564 if (pe)
5565 {
5566 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5567 gcc_assert (!new_bb);
5568 }
5569 else
5570 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5571
5572 msq_init = gimple_assign_lhs (new_stmt);
5573 }
5574
5575 /* 4. Create realignment token using a target builtin, if available.
5576 It is done either inside the containing loop, or before LOOP (as
5577 determined above). */
5578
5579 if (targetm.vectorize.builtin_mask_for_load)
5580 {
5581 gcall *new_stmt;
5582 tree builtin_decl;
5583
5584       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5585 if (!init_addr)
5586 {
5587 /* Generate the INIT_ADDR computation outside LOOP. */
5588 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5589 NULL_TREE);
5590 if (loop)
5591 {
5592 pe = loop_preheader_edge (loop);
5593 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5594 gcc_assert (!new_bb);
5595 }
5596 else
5597 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5598 }
5599
5600 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5601 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5602 vec_dest =
5603 vect_create_destination_var (scalar_dest,
5604 gimple_call_return_type (new_stmt));
5605 new_temp = make_ssa_name (vec_dest, new_stmt);
5606 gimple_call_set_lhs (new_stmt, new_temp);
5607
5608 if (compute_in_loop)
5609 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5610 else
5611 {
5612 /* Generate the misalignment computation outside LOOP. */
5613 pe = loop_preheader_edge (loop);
5614 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5615 gcc_assert (!new_bb);
5616 }
5617
5618 *realignment_token = gimple_call_lhs (new_stmt);
5619
5620 /* The result of the CALL_EXPR to this builtin is determined from
5621 the value of the parameter and no global variables are touched
5622 which makes the builtin a "const" function. Requiring the
5623 builtin to have the "const" attribute makes it unnecessary
5624 to call mark_call_clobbered. */
5625 gcc_assert (TREE_READONLY (builtin_decl));
5626 }
5627
5628 if (alignment_support_scheme == dr_explicit_realign)
5629 return msq;
5630
5631 gcc_assert (!compute_in_loop);
5632 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5633
5634
5635 /* 5. Create msq = phi <msq_init, lsq> in loop */
5636
5637 pe = loop_preheader_edge (containing_loop);
5638 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5639 msq = make_ssa_name (vec_dest);
5640 phi_stmt = create_phi_node (msq, containing_loop->header);
5641 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5642
5643 return msq;
5644 }
5645
5646
5647 /* Function vect_grouped_load_supported.
5648
5649 COUNT is the size of the load group (the number of statements plus the
5650 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5651 only one statement, with a gap of COUNT - 1.
5652
5653 Returns true if a suitable permute exists. */
5654
5655 bool
5656 vect_grouped_load_supported (tree vectype, bool single_element_p,
5657 unsigned HOST_WIDE_INT count)
5658 {
5659 machine_mode mode = TYPE_MODE (vectype);
5660
5661 /* If this is single-element interleaving with an element distance
5662      that leaves unused vector loads around, punt - we at least create
5663 very sub-optimal code in that case (and blow up memory,
5664 see PR65518). */
5665 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5666 {
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 "single-element interleaving not supported "
5670 			 "for non-adjacent vector loads\n");
5671 return false;
5672 }
5673
5674 /* vect_permute_load_chain requires the group size to be equal to 3 or
5675 be a power of two. */
5676 if (count != 3 && exact_log2 (count) == -1)
5677 {
5678 if (dump_enabled_p ())
5679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5680 "the size of the group of accesses"
5681 " is not a power of 2 or not equal to 3\n");
5682 return false;
5683 }
5684
5685 /* Check that the permutation is supported. */
5686 if (VECTOR_MODE_P (mode))
5687 {
5688 unsigned int i, j;
5689 if (count == 3)
5690 {
5691 unsigned int nelt;
5692 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5693 {
5694 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696 "cannot handle groups of 3 loads for"
5697 " variable-length vectors\n");
5698 return false;
5699 }
5700
5701 vec_perm_builder sel (nelt, nelt, 1);
5702 sel.quick_grow (nelt);
5703 vec_perm_indices indices;
5704 unsigned int k;
5705 for (k = 0; k < 3; k++)
5706 {
5707 for (i = 0; i < nelt; i++)
5708 if (3 * i + k < 2 * nelt)
5709 sel[i] = 3 * i + k;
5710 else
5711 sel[i] = 0;
5712 indices.new_vector (sel, 2, nelt);
5713 if (!can_vec_perm_const_p (mode, indices))
5714 {
5715 if (dump_enabled_p ())
5716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5717 "shuffle of 3 loads is not supported by"
5718 " target\n");
5719 return false;
5720 }
5721 for (i = 0, j = 0; i < nelt; i++)
5722 if (3 * i + k < 2 * nelt)
5723 sel[i] = i;
5724 else
5725 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5726 indices.new_vector (sel, 2, nelt);
5727 if (!can_vec_perm_const_p (mode, indices))
5728 {
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5731 "shuffle of 3 loads is not supported by"
5732 " target\n");
5733 return false;
5734 }
5735 }
5736 return true;
5737 }
5738 else
5739 {
5740 /* If length is not equal to 3 then only power of 2 is supported. */
5741 gcc_assert (pow2p_hwi (count));
5742 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5743
5744 /* The encoding has a single stepped pattern. */
5745 vec_perm_builder sel (nelt, 1, 3);
5746 sel.quick_grow (3);
5747 for (i = 0; i < 3; i++)
5748 sel[i] = i * 2;
5749 vec_perm_indices indices (sel, 2, nelt);
5750 if (can_vec_perm_const_p (mode, indices))
5751 {
5752 for (i = 0; i < 3; i++)
5753 sel[i] = i * 2 + 1;
5754 indices.new_vector (sel, 2, nelt);
5755 if (can_vec_perm_const_p (mode, indices))
5756 return true;
5757 }
5758 }
5759 }
5760
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "extract even/odd not supported by target\n");
5764 return false;
5765 }
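
/* For illustration only: with a hypothetical 8-element vector mode the two
   encoded selectors tested above for the power-of-two case expand to

     { 0, 2, 4, 6, 8, 10, 12, 14 }  extract even
     { 1, 3, 5, 7, 9, 11, 13, 15 }  extract odd

   i.e. the permutations that vect_permute_load_chain emits below.  */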
5766
5767 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5768 type VECTYPE. MASKED_P says whether the masked form is needed. */
5769
5770 bool
5771 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5772 bool masked_p)
5773 {
5774 if (masked_p)
5775 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5776 vec_mask_load_lanes_optab,
5777 vectype, count);
5778 else
5779 return vect_lanes_optab_supported_p ("vec_load_lanes",
5780 vec_load_lanes_optab,
5781 vectype, count);
5782 }
5783
5784 /* Function vect_permute_load_chain.
5785
5786 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5787 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5788 the input data correctly. Return the final references for loads in
5789 RESULT_CHAIN.
5790
5791 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5792 The input is 4 vectors each containing 8 elements. We assign a number to each
5793 element, the input sequence is:
5794
5795 1st vec: 0 1 2 3 4 5 6 7
5796 2nd vec: 8 9 10 11 12 13 14 15
5797 3rd vec: 16 17 18 19 20 21 22 23
5798 4th vec: 24 25 26 27 28 29 30 31
5799
5800 The output sequence should be:
5801
5802 1st vec: 0 4 8 12 16 20 24 28
5803 2nd vec: 1 5 9 13 17 21 25 29
5804 3rd vec: 2 6 10 14 18 22 26 30
5805 4th vec: 3 7 11 15 19 23 27 31
5806
5807 i.e., the first output vector should contain the first elements of each
5808 interleaving group, etc.
5809
5810 We use extract_even/odd instructions to create such output. The input of
5811 each extract_even/odd operation is two vectors
5812 1st vec 2nd vec
5813 0 1 2 3 4 5 6 7
5814
5815 and the output is the vector of extracted even/odd elements. The output of
5816 extract_even will be: 0 2 4 6
5817 and of extract_odd: 1 3 5 7
5818
5819
5820 The permutation is done in log LENGTH stages. In each stage extract_even
5821 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5822 their order. In our example,
5823
5824 E1: extract_even (1st vec, 2nd vec)
5825 E2: extract_odd (1st vec, 2nd vec)
5826 E3: extract_even (3rd vec, 4th vec)
5827 E4: extract_odd (3rd vec, 4th vec)
5828
5829 The output for the first stage will be:
5830
5831 E1: 0 2 4 6 8 10 12 14
5832 E2: 1 3 5 7 9 11 13 15
5833 E3: 16 18 20 22 24 26 28 30
5834 E4: 17 19 21 23 25 27 29 31
5835
5836 In order to proceed and create the correct sequence for the next stage (or
5837 for the correct output, if the second stage is the last one, as in our
5838 example), we first put the output of extract_even operation and then the
5839 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5840 The input for the second stage is:
5841
5842 1st vec (E1): 0 2 4 6 8 10 12 14
5843 2nd vec (E3): 16 18 20 22 24 26 28 30
5844 3rd vec (E2): 1 3 5 7 9 11 13 15
5845 4th vec (E4): 17 19 21 23 25 27 29 31
5846
5847 The output of the second stage:
5848
5849 E1: 0 4 8 12 16 20 24 28
5850 E2: 2 6 10 14 18 22 26 30
5851 E3: 1 5 9 13 17 21 25 29
5852 E4: 3 7 11 15 19 23 27 31
5853
5854 And RESULT_CHAIN after reordering:
5855
5856 1st vec (E1): 0 4 8 12 16 20 24 28
5857 2nd vec (E3): 1 5 9 13 17 21 25 29
5858 3rd vec (E2): 2 6 10 14 18 22 26 30
5859 4th vec (E4): 3 7 11 15 19 23 27 31. */
5860
5861 static void
5862 vect_permute_load_chain (vec<tree> dr_chain,
5863 unsigned int length,
5864 stmt_vec_info stmt_info,
5865 gimple_stmt_iterator *gsi,
5866 vec<tree> *result_chain)
5867 {
5868 tree data_ref, first_vect, second_vect;
5869 tree perm_mask_even, perm_mask_odd;
5870 tree perm3_mask_low, perm3_mask_high;
5871 gimple *perm_stmt;
5872 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5873 unsigned int i, j, log_length = exact_log2 (length);
5874
5875 result_chain->quick_grow (length);
5876 memcpy (result_chain->address (), dr_chain.address (),
5877 length * sizeof (tree));
5878
5879 if (length == 3)
5880 {
5881 /* vect_grouped_load_supported ensures that this is constant. */
5882 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5883 unsigned int k;
5884
5885 vec_perm_builder sel (nelt, nelt, 1);
5886 sel.quick_grow (nelt);
5887 vec_perm_indices indices;
5888 for (k = 0; k < 3; k++)
5889 {
5890 for (i = 0; i < nelt; i++)
5891 if (3 * i + k < 2 * nelt)
5892 sel[i] = 3 * i + k;
5893 else
5894 sel[i] = 0;
5895 indices.new_vector (sel, 2, nelt);
5896 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5897
5898 for (i = 0, j = 0; i < nelt; i++)
5899 if (3 * i + k < 2 * nelt)
5900 sel[i] = i;
5901 else
5902 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5903 indices.new_vector (sel, 2, nelt);
5904 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5905
5906 first_vect = dr_chain[0];
5907 second_vect = dr_chain[1];
5908
5909 /* Create interleaving stmt (low part of):
5910 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5911 ...}> */
5912 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5913 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5914 second_vect, perm3_mask_low);
5915 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5916
5917 /* Create interleaving stmt (high part of):
5918 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5919 ...}> */
5920 first_vect = data_ref;
5921 second_vect = dr_chain[2];
5922 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5923 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5924 second_vect, perm3_mask_high);
5925 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5926 (*result_chain)[k] = data_ref;
5927 }
5928 }
5929 else
5930 {
5931 /* If length is not equal to 3 then only power of 2 is supported. */
5932 gcc_assert (pow2p_hwi (length));
5933
5934 /* The encoding has a single stepped pattern. */
5935 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5936 vec_perm_builder sel (nelt, 1, 3);
5937 sel.quick_grow (3);
5938 for (i = 0; i < 3; ++i)
5939 sel[i] = i * 2;
5940 vec_perm_indices indices (sel, 2, nelt);
5941 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5942
5943 for (i = 0; i < 3; ++i)
5944 sel[i] = i * 2 + 1;
5945 indices.new_vector (sel, 2, nelt);
5946 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5947
5948 for (i = 0; i < log_length; i++)
5949 {
5950 for (j = 0; j < length; j += 2)
5951 {
5952 first_vect = dr_chain[j];
5953 second_vect = dr_chain[j+1];
5954
5955 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5956 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5957 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5958 first_vect, second_vect,
5959 perm_mask_even);
5960 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5961 (*result_chain)[j/2] = data_ref;
5962
5963 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5964 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5965 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5966 first_vect, second_vect,
5967 perm_mask_odd);
5968 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5969 (*result_chain)[j/2+length/2] = data_ref;
5970 }
5971 memcpy (dr_chain.address (), result_chain->address (),
5972 length * sizeof (tree));
5973 }
5974 }
5975 }
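
/* As a rough sketch (SSA version numbers invented for illustration), for
   LENGTH == 2 and a hypothetical 8-element vector chain {v0, v1} the function
   above generates

     vect_perm_even_1 = VEC_PERM_EXPR <v0, v1, { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd_2 = VEC_PERM_EXPR <v0, v1, { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   and RESULT_CHAIN becomes { vect_perm_even_1, vect_perm_odd_2 }.  */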
5976
5977 /* Function vect_shift_permute_load_chain.
5978
5979    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5980    sequence of stmts to reorder the input data accordingly.
5981    Return the final references for loads in RESULT_CHAIN.
5982    Return true if it succeeds, false otherwise.
5983
5984 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5985 The input is 3 vectors each containing 8 elements. We assign a
5986 number to each element, the input sequence is:
5987
5988 1st vec: 0 1 2 3 4 5 6 7
5989 2nd vec: 8 9 10 11 12 13 14 15
5990 3rd vec: 16 17 18 19 20 21 22 23
5991
5992 The output sequence should be:
5993
5994 1st vec: 0 3 6 9 12 15 18 21
5995 2nd vec: 1 4 7 10 13 16 19 22
5996 3rd vec: 2 5 8 11 14 17 20 23
5997
5998 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5999
6000 First we shuffle all 3 vectors to get correct elements order:
6001
6002 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6003 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6004 3rd vec: (16 19 22) (17 20 23) (18 21)
6005
6006 Next we unite and shift vector 3 times:
6007
6008 1st step:
6009 shift right by 6 the concatenation of:
6010 "1st vec" and "2nd vec"
6011 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6012 "2nd vec" and "3rd vec"
6013 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6014 "3rd vec" and "1st vec"
6015 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6016 | New vectors |
6017
6018 So that now new vectors are:
6019
6020 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6021 2nd vec: (10 13) (16 19 22) (17 20 23)
6022 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6023
6024 2nd step:
6025 shift right by 5 the concatenation of:
6026 "1st vec" and "3rd vec"
6027 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6028 "2nd vec" and "1st vec"
6029 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6030 "3rd vec" and "2nd vec"
6031 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6032 | New vectors |
6033
6034 So that now new vectors are:
6035
6036 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6037 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6038 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6039
6040 3rd step:
6041 shift right by 5 the concatenation of:
6042 "1st vec" and "1st vec"
6043 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6044 shift right by 3 the concatenation of:
6045 "2nd vec" and "2nd vec"
6046 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6047 | New vectors |
6048
6049 So that now all vectors are READY:
6050 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6051 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6052 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6053
6054    This algorithm is faster than the one in vect_permute_load_chain if:
6055    1. "shift of a concatenation" is faster than general permutation.
6056 This is usually so.
6057 2. The TARGET machine can't execute vector instructions in parallel.
6058       This is because each step of the algorithm depends on the previous one.
6059 The algorithm in vect_permute_load_chain is much more parallel.
6060
6061 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6062 */
6063
6064 static bool
6065 vect_shift_permute_load_chain (vec<tree> dr_chain,
6066 unsigned int length,
6067 stmt_vec_info stmt_info,
6068 gimple_stmt_iterator *gsi,
6069 vec<tree> *result_chain)
6070 {
6071 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6072 tree perm2_mask1, perm2_mask2, perm3_mask;
6073 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6074 gimple *perm_stmt;
6075
6076 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6077 unsigned int i;
6078 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6079
6080 unsigned HOST_WIDE_INT nelt, vf;
6081 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6082 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6083 /* Not supported for variable-length vectors. */
6084 return false;
6085
6086 vec_perm_builder sel (nelt, nelt, 1);
6087 sel.quick_grow (nelt);
6088
6089 result_chain->quick_grow (length);
6090 memcpy (result_chain->address (), dr_chain.address (),
6091 length * sizeof (tree));
6092
6093 if (pow2p_hwi (length) && vf > 4)
6094 {
6095 unsigned int j, log_length = exact_log2 (length);
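      /* Generating permutation constant to gather the even-indexed elements
         into the low half of a vector and the odd-indexed elements into the
         high half.  For vector length 8 it is {0 2 4 6 1 3 5 7}.  */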
6096 for (i = 0; i < nelt / 2; ++i)
6097 sel[i] = i * 2;
6098 for (i = 0; i < nelt / 2; ++i)
6099 sel[nelt / 2 + i] = i * 2 + 1;
6100 vec_perm_indices indices (sel, 2, nelt);
6101 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6102 {
6103 if (dump_enabled_p ())
6104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6105 "shuffle of 2 fields structure is not \
6106 supported by target\n");
6107 return false;
6108 }
6109 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6110
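      /* The complementary constant: odd-indexed elements into the low half,
         even-indexed elements into the high half.  For vector length 8 it is
         {1 3 5 7 0 2 4 6}.  */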
6111 for (i = 0; i < nelt / 2; ++i)
6112 sel[i] = i * 2 + 1;
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[nelt / 2 + i] = i * 2;
6115 indices.new_vector (sel, 2, nelt);
6116 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6117 {
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120 "shuffle of 2 fields structure is not \
6121 supported by target\n");
6122 return false;
6123 }
6124 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6125
6126 /* Generating permutation constant to shift all elements.
6127 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6128 for (i = 0; i < nelt; i++)
6129 sel[i] = nelt / 2 + i;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6132 {
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "shift permutation is not supported by target\n");
6136 return false;
6137 }
6138 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6139
6140 /* Generating permutation constant to select vector from 2.
6141 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6142 for (i = 0; i < nelt / 2; i++)
6143 sel[i] = i;
6144 for (i = nelt / 2; i < nelt; i++)
6145 sel[i] = nelt + i;
6146 indices.new_vector (sel, 2, nelt);
6147 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6148 {
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6151 "select is not supported by target\n");
6152 return false;
6153 }
6154 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6155
6156 for (i = 0; i < log_length; i++)
6157 {
6158 for (j = 0; j < length; j += 2)
6159 {
6160 first_vect = dr_chain[j];
6161 second_vect = dr_chain[j + 1];
6162
6163 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6164 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6165 first_vect, first_vect,
6166 perm2_mask1);
6167 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6168 vect[0] = data_ref;
6169
6170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6172 second_vect, second_vect,
6173 perm2_mask2);
6174 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6175 vect[1] = data_ref;
6176
6177 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6178 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6179 vect[0], vect[1], shift1_mask);
6180 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6181 (*result_chain)[j/2 + length/2] = data_ref;
6182
6183 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6184 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6185 vect[0], vect[1], select_mask);
6186 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6187 (*result_chain)[j/2] = data_ref;
6188 }
6189 memcpy (dr_chain.address (), result_chain->address (),
6190 length * sizeof (tree));
6191 }
6192 return true;
6193 }
6194 if (length == 3 && vf > 2)
6195 {
6196 unsigned int k = 0, l = 0;
6197
6198       /* Generating permutation constant to get all elements in right order.
6199 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6200 for (i = 0; i < nelt; i++)
6201 {
6202 if (3 * k + (l % 3) >= nelt)
6203 {
6204 k = 0;
6205 l += (3 - (nelt % 3));
6206 }
6207 sel[i] = 3 * k + (l % 3);
6208 k++;
6209 }
6210 vec_perm_indices indices (sel, 2, nelt);
6211 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6212 {
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "shuffle of 3 fields structure is not \
6216 supported by target\n");
6217 return false;
6218 }
6219 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6220
6221 /* Generating permutation constant to shift all elements.
6222 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6223 for (i = 0; i < nelt; i++)
6224 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6225 indices.new_vector (sel, 2, nelt);
6226 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6227 {
6228 if (dump_enabled_p ())
6229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230 "shift permutation is not supported by target\n");
6231 return false;
6232 }
6233 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6234
6235 /* Generating permutation constant to shift all elements.
6236 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6237 for (i = 0; i < nelt; i++)
6238 sel[i] = 2 * (nelt / 3) + 1 + i;
6239 indices.new_vector (sel, 2, nelt);
6240 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6241 {
6242 if (dump_enabled_p ())
6243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6244 "shift permutation is not supported by target\n");
6245 return false;
6246 }
6247 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6248
6249 /* Generating permutation constant to shift all elements.
6250 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6251 for (i = 0; i < nelt; i++)
6252 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6253 indices.new_vector (sel, 2, nelt);
6254 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6255 {
6256 if (dump_enabled_p ())
6257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 "shift permutation is not supported by target\n");
6259 return false;
6260 }
6261 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6262
6263 /* Generating permutation constant to shift all elements.
6264 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6265 for (i = 0; i < nelt; i++)
6266 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6267 indices.new_vector (sel, 2, nelt);
6268 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6269 {
6270 if (dump_enabled_p ())
6271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6272 "shift permutation is not supported by target\n");
6273 return false;
6274 }
6275 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6276
6277 for (k = 0; k < 3; k++)
6278 {
6279 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6280 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6281 dr_chain[k], dr_chain[k],
6282 perm3_mask);
6283 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6284 vect[k] = data_ref;
6285 }
6286
6287 for (k = 0; k < 3; k++)
6288 {
6289 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6290 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6291 vect[k % 3], vect[(k + 1) % 3],
6292 shift1_mask);
6293 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6294 vect_shift[k] = data_ref;
6295 }
6296
6297 for (k = 0; k < 3; k++)
6298 {
6299 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6300 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6301 vect_shift[(4 - k) % 3],
6302 vect_shift[(3 - k) % 3],
6303 shift2_mask);
6304 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6305 vect[k] = data_ref;
6306 }
6307
6308 (*result_chain)[3 - (nelt % 3)] = vect[2];
6309
6310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6312 vect[0], shift3_mask);
6313 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6314 (*result_chain)[nelt % 3] = data_ref;
6315
6316 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6317 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6318 vect[1], shift4_mask);
6319 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6320 (*result_chain)[0] = data_ref;
6321 return true;
6322 }
6323 return false;
6324 }
6325
6326 /* Function vect_transform_grouped_load.
6327
6328 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6329    to perform their permutation and ascribe the resulting vectorized statements to
6330 the scalar statements.
6331 */
6332
6333 void
6334 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6335 int size, gimple_stmt_iterator *gsi)
6336 {
6337 machine_mode mode;
6338 vec<tree> result_chain = vNULL;
6339
6340 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6341      RESULT_CHAIN is the output of vect_permute_load_chain; it contains
6342      permuted vectors that are ready for vector computation.  */
6343 result_chain.create (size);
6344
6345   /* If the reassociation width for the vector type is 2 or greater, the target
6346      machine can execute 2 or more vector instructions in parallel.  Otherwise
6347      try to get a chain for the load group using vect_shift_permute_load_chain.  */
6348 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6349 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6350 || pow2p_hwi (size)
6351 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6352 gsi, &result_chain))
6353 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6354 vect_record_grouped_load_vectors (stmt_info, result_chain);
6355 result_chain.release ();
6356 }
6357
6358 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6359 generated as part of the vectorization of STMT_INFO. Assign the statement
6360 for each vector to the associated scalar statement. */
6361
6362 void
6363 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6364 vec<tree> result_chain)
6365 {
6366 vec_info *vinfo = stmt_info->vinfo;
6367 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6368 unsigned int i, gap_count;
6369 tree tmp_data_ref;
6370
6371 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6372      Since we scan the chain starting from its first node, their order
6373      corresponds to the order of data-refs in RESULT_CHAIN.  */
6374 stmt_vec_info next_stmt_info = first_stmt_info;
6375 gap_count = 1;
6376 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6377 {
6378 if (!next_stmt_info)
6379 break;
6380
6381 /* Skip the gaps. Loads created for the gaps will be removed by dead
6382 code elimination pass later. No need to check for the first stmt in
6383 the group, since it always exists.
6384 DR_GROUP_GAP is the number of steps in elements from the previous
6385 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6386 correspond to the gaps. */
6387 if (next_stmt_info != first_stmt_info
6388 && gap_count < DR_GROUP_GAP (next_stmt_info))
6389 {
6390 gap_count++;
6391 continue;
6392 }
6393
6394 while (next_stmt_info)
6395 {
6396 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6397 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6398 copies, and we put the new vector statement in the first available
6399 RELATED_STMT. */
6400 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6401 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6402 else
6403 {
6404 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6405 {
6406 stmt_vec_info prev_stmt_info
6407 = STMT_VINFO_VEC_STMT (next_stmt_info);
6408 stmt_vec_info rel_stmt_info
6409 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6410 while (rel_stmt_info)
6411 {
6412 prev_stmt_info = rel_stmt_info;
6413 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6414 }
6415
6416 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6417 }
6418 }
6419
6420 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6421 gap_count = 1;
6422 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6423 put the same TMP_DATA_REF as its vectorized statement; otherwise
6424 get the next data-ref from RESULT_CHAIN. */
6425 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6426 break;
6427 }
6428 }
6429 }
6430
6431 /* Function vect_can_force_dr_alignment_p.
6432
6433 Returns whether the alignment of a DECL can be forced to be aligned
6434 on ALIGNMENT bit boundary. */
6435
6436 bool
6437 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6438 {
6439 if (!VAR_P (decl))
6440 return false;
6441
6442 if (decl_in_symtab_p (decl)
6443 && !symtab_node::get (decl)->can_increase_alignment_p ())
6444 return false;
6445
6446 if (TREE_STATIC (decl))
6447 return (alignment <= MAX_OFILE_ALIGNMENT);
6448 else
6449 return (alignment <= MAX_STACK_ALIGNMENT);
6450 }
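
/* For example, a function-local "int buf[64]" lives on the stack and can only
   be forced up to MAX_STACK_ALIGNMENT, whereas a TREE_STATIC definition is
   limited by MAX_OFILE_ALIGNMENT instead.  */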
6451
6452
6453 /* Return whether the data reference DR_INFO is supported with respect to its
6454 alignment.
6455 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6456    if it is aligned, i.e., check if it is possible to vectorize it with different
6457 alignment. */
6458
6459 enum dr_alignment_support
6460 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6461 bool check_aligned_accesses)
6462 {
6463 data_reference *dr = dr_info->dr;
6464 stmt_vec_info stmt_info = dr_info->stmt;
6465 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6466 machine_mode mode = TYPE_MODE (vectype);
6467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6468 struct loop *vect_loop = NULL;
6469 bool nested_in_vect_loop = false;
6470
6471 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6472 return dr_aligned;
6473
6474 /* For now assume all conditional loads/stores support unaligned
6475 access without any special code. */
6476 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6477 if (gimple_call_internal_p (stmt)
6478 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6479 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6480 return dr_unaligned_supported;
6481
6482 if (loop_vinfo)
6483 {
6484 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6485 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6486 }
6487
6488 /* Possibly unaligned access. */
6489
6490 /* We can choose between using the implicit realignment scheme (generating
6491 a misaligned_move stmt) and the explicit realignment scheme (generating
6492 aligned loads with a REALIGN_LOAD). There are two variants to the
6493 explicit realignment scheme: optimized, and unoptimized.
6494 We can optimize the realignment only if the step between consecutive
6495 vector loads is equal to the vector size. Since the vector memory
6496 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6497 is guaranteed that the misalignment amount remains the same throughout the
6498 execution of the vectorized loop. Therefore, we can create the
6499 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6500 at the loop preheader.
6501
6502 However, in the case of outer-loop vectorization, when vectorizing a
6503 memory access in the inner-loop nested within the LOOP that is now being
6504 vectorized, while it is guaranteed that the misalignment of the
6505 vectorized memory access will remain the same in different outer-loop
6506    iterations, it is *not* guaranteed that it will remain the same throughout
6507 the execution of the inner-loop. This is because the inner-loop advances
6508 with the original scalar step (and not in steps of VS). If the inner-loop
6509 step happens to be a multiple of VS, then the misalignment remains fixed
6510 and we can use the optimized realignment scheme. For example:
6511
6512 for (i=0; i<N; i++)
6513 for (j=0; j<M; j++)
6514 s += a[i+j];
6515
6516 When vectorizing the i-loop in the above example, the step between
6517 consecutive vector loads is 1, and so the misalignment does not remain
6518 fixed across the execution of the inner-loop, and the realignment cannot
6519 be optimized (as illustrated in the following pseudo vectorized loop):
6520
6521 for (i=0; i<N; i+=4)
6522 for (j=0; j<M; j++){
6523 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6524 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6525 // (assuming that we start from an aligned address).
6526 }
6527
6528 We therefore have to use the unoptimized realignment scheme:
6529
6530 for (i=0; i<N; i+=4)
6531 for (j=k; j<M; j+=4)
6532 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6533 // that the misalignment of the initial address is
6534 // 0).
6535
6536 The loop can then be vectorized as follows:
6537
6538 for (k=0; k<4; k++){
6539 rt = get_realignment_token (&vp[k]);
6540 for (i=0; i<N; i+=4){
6541 v1 = vp[i+k];
6542 for (j=k; j<M; j+=4){
6543 v2 = vp[i+j+VS-1];
6544 va = REALIGN_LOAD <v1,v2,rt>;
6545 vs += va;
6546 v1 = v2;
6547 }
6548 }
6549 } */
6550
6551 if (DR_IS_READ (dr))
6552 {
6553 bool is_packed = false;
6554 tree type = (TREE_TYPE (DR_REF (dr)));
6555
6556 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6557 && (!targetm.vectorize.builtin_mask_for_load
6558 || targetm.vectorize.builtin_mask_for_load ()))
6559 {
6560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6561
6562 /* If we are doing SLP then the accesses need not have the
6563 same alignment, instead it depends on the SLP group size. */
6564 if (loop_vinfo
6565 && STMT_SLP_TYPE (stmt_info)
6566 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6567 * (DR_GROUP_SIZE
6568 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6569 TYPE_VECTOR_SUBPARTS (vectype)))
6570 ;
6571 else if (!loop_vinfo
6572 || (nested_in_vect_loop
6573 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6574 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6575 return dr_explicit_realign;
6576 else
6577 return dr_explicit_realign_optimized;
6578 }
6579 if (!known_alignment_for_access_p (dr_info))
6580 is_packed = not_size_aligned (DR_REF (dr));
6581
6582 if (targetm.vectorize.support_vector_misalignment
6583 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6584 /* Can't software pipeline the loads, but can at least do them. */
6585 return dr_unaligned_supported;
6586 }
6587 else
6588 {
6589 bool is_packed = false;
6590 tree type = (TREE_TYPE (DR_REF (dr)));
6591
6592 if (!known_alignment_for_access_p (dr_info))
6593 is_packed = not_size_aligned (DR_REF (dr));
6594
6595 if (targetm.vectorize.support_vector_misalignment
6596 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6597 return dr_unaligned_supported;
6598 }
6599
6600 /* Unsupported. */
6601 return dr_unaligned_unsupported;
6602 }